diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 59e2ea224f6..573fecca003 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -7,8 +7,9 @@ notebooks/ @rapidsai/cudf-python-codeowners python/dask_cudf/ @rapidsai/cudf-dask-codeowners #cmake code owners -**/CMakeLists.txt @rapidsai/cudf-cmake-codeowners -**/cmake/ @rapidsai/cudf-cmake-codeowners +cpp/CMakeLists.txt @rapidsai/cudf-cmake-codeowners +cpp/libcudf_kafka/CMakeLists.txt @rapidsai/cudf-cmake-codeowners +**/cmake/ @rapidsai/cudf-cmake-codeowners #java code owners java/ @rapidsai/cudf-java-codeowners diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index ae895daf28a..2c5ecf68690 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -19,15 +19,18 @@ Here are some guidelines to help the review process go smoothly. noted here: https://help.github.com/articles/closing-issues-using-keywords/ 5. If your pull request is not ready for review but you want to make use of the - continuous integration testing facilities please label it with `[WIP]`. + continuous integration testing facilities please mark your pull request as Draft. + https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/changing-the-stage-of-a-pull-request#converting-a-pull-request-to-a-draft 6. If your pull request is ready to be reviewed without requiring additional - work on top of it, then remove the `[WIP]` label (if present) and replace - it with `[REVIEW]`. If assistance is required to complete the functionality, - for example when the C/C++ code of a feature is complete but Python bindings - are still required, then add the label `[HELP-REQ]` so that others can triage - and assist. The additional changes then can be implemented on top of the - same PR. If the assistance is done by members of the rapidsAI team, then no + work on top of it, then remove it from "Draft" and make it "Ready for Review". 
+ https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/changing-the-stage-of-a-pull-request#marking-a-pull-request-as-ready-for-review + + If assistance is required to complete the functionality, for example when the + C/C++ code of a feature is complete but Python bindings are still required, + then add the label `help wanted` so that others can triage and assist. + The additional changes then can be implemented on top of the same PR. + If the assistance is done by members of the rapidsAI team, then no additional actions are required by the creator of the original PR for this, otherwise the original author of the PR needs to give permission to the person(s) assisting to commit to their personal fork of the project. If that @@ -39,10 +42,10 @@ Here are some guidelines to help the review process go smoothly. features or make changes out of the scope of those requested by the reviewer (doing this just add delays as already reviewed code ends up having to be re-reviewed/it is hard to tell what is new etc!). Further, please do not - rebase your branch on main/force push/rewrite history, doing any of these - causes the context of any comments made by reviewers to be lost. If - conflicts occur against main they should be resolved by merging main - into the branch used for making the pull request. + rebase your branch on the target branch, force push, or rewrite history. + Doing any of these causes the context of any comments made by reviewers to be lost. + If conflicts occur against the target branch they should be resolved by + merging the target branch into the branch used for making the pull request. Many thanks in advance for your cooperation! 
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 69f6634b5c2..f82fa9ef361 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -41,6 +41,12 @@ repos: entry: mypy --config-file=python/cudf/setup.cfg python/cudf/cudf language: system types: [python] + - repo: https://github.com/pycqa/pydocstyle + rev: 6.0.0 + hooks: + - id: pydocstyle + args: ["--config=python/.flake8"] + default_language_version: python: python3 diff --git a/CHANGELOG.md b/CHANGELOG.md index 21ab8ed3274..08a34a07ba3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,417 @@ -# cuDF 0.19.0 (Date TBD) - -Please see https://github.com/rapidsai/cudf/releases/tag/v0.19.0a for the latest changes to this development branch. +# cuDF 21.06.00 (9 Jun 2021) + +## 🚨 Breaking Changes + +- Add support for `make_meta_obj` dispatch in `dask-cudf` ([#8342](https://github.com/rapidsai/cudf/pull/8342)) [@galipremsagar](https://github.com/galipremsagar) +- Add separator-on-null parameter to strings concatenate APIs ([#8282](https://github.com/rapidsai/cudf/pull/8282)) [@davidwendt](https://github.com/davidwendt) +- Introduce a common parent class for NumericalColumn and DecimalColumn ([#8278](https://github.com/rapidsai/cudf/pull/8278)) [@vyasr](https://github.com/vyasr) +- Update ORC statistics API to use C++17 standard library ([#8241](https://github.com/rapidsai/cudf/pull/8241)) [@vuule](https://github.com/vuule) +- Preserve column hierarchy when getting NULL row from `LIST` column ([#8206](https://github.com/rapidsai/cudf/pull/8206)) [@isVoid](https://github.com/isVoid) +- `Groupby.shift` c++ API refactor and python binding ([#8131](https://github.com/rapidsai/cudf/pull/8131)) [@isVoid](https://github.com/isVoid) + +## 🐛 Bug Fixes + +- Fix struct flattening to add a validity column only when the input column has null element ([#8374](https://github.com/rapidsai/cudf/pull/8374)) 
[@ttnghia](https://github.com/ttnghia) +- Compilation fix: Remove redefinition for `std::is_same_v()` ([#8369](https://github.com/rapidsai/cudf/pull/8369)) [@mythrocks](https://github.com/mythrocks) +- Add backward compatibility for `dask-cudf` to work with other versions of `dask` ([#8368](https://github.com/rapidsai/cudf/pull/8368)) [@galipremsagar](https://github.com/galipremsagar) +- Handle empty results with nested types in copy_if_else ([#8359](https://github.com/rapidsai/cudf/pull/8359)) [@nvdbaranec](https://github.com/nvdbaranec) +- Handle nested column types properly for empty parquet files. ([#8350](https://github.com/rapidsai/cudf/pull/8350)) [@nvdbaranec](https://github.com/nvdbaranec) +- Raise error when unsupported arguments are passed to `dask_cudf.DataFrame.sort_values` ([#8349](https://github.com/rapidsai/cudf/pull/8349)) [@galipremsagar](https://github.com/galipremsagar) +- Raise `NotImplementedError` for axis=1 in `rank` ([#8347](https://github.com/rapidsai/cudf/pull/8347)) [@galipremsagar](https://github.com/galipremsagar) +- Add support for `make_meta_obj` dispatch in `dask-cudf` ([#8342](https://github.com/rapidsai/cudf/pull/8342)) [@galipremsagar](https://github.com/galipremsagar) +- Update Java string concatenate test for single column ([#8330](https://github.com/rapidsai/cudf/pull/8330)) [@tgravescs](https://github.com/tgravescs) +- Use empty_like in scatter ([#8314](https://github.com/rapidsai/cudf/pull/8314)) [@revans2](https://github.com/revans2) +- Fix concatenate_lists_ignore_null on rows of all_nulls ([#8312](https://github.com/rapidsai/cudf/pull/8312)) [@sperlingxx](https://github.com/sperlingxx) +- Add separator-on-null parameter to strings concatenate APIs ([#8282](https://github.com/rapidsai/cudf/pull/8282)) [@davidwendt](https://github.com/davidwendt) +- COLLECT_LIST support returning empty output columns. 
([#8279](https://github.com/rapidsai/cudf/pull/8279)) [@mythrocks](https://github.com/mythrocks) +- Update io util to convert path like object to string ([#8275](https://github.com/rapidsai/cudf/pull/8275)) [@ayushdg](https://github.com/ayushdg) +- Fix result column types for empty inputs to rolling window ([#8274](https://github.com/rapidsai/cudf/pull/8274)) [@mythrocks](https://github.com/mythrocks) +- Actually test equality in assert_groupby_results_equal ([#8272](https://github.com/rapidsai/cudf/pull/8272)) [@shwina](https://github.com/shwina) +- CMake always explicitly specify a source files extension ([#8270](https://github.com/rapidsai/cudf/pull/8270)) [@robertmaynard](https://github.com/robertmaynard) +- Fix struct binary search and struct flattening ([#8268](https://github.com/rapidsai/cudf/pull/8268)) [@ttnghia](https://github.com/ttnghia) +- Revert "patch thrust to fix intmax num elements limitation in scan_by_key" ([#8263](https://github.com/rapidsai/cudf/pull/8263)) [@cwharris](https://github.com/cwharris) +- upgrade dlpack to 0.5 ([#8262](https://github.com/rapidsai/cudf/pull/8262)) [@cwharris](https://github.com/cwharris) +- Fixes CSV-reader type inference for thousands separator and decimal point ([#8261](https://github.com/rapidsai/cudf/pull/8261)) [@elstehle](https://github.com/elstehle) +- Fix incorrect assertion in Java concat ([#8258](https://github.com/rapidsai/cudf/pull/8258)) [@sperlingxx](https://github.com/sperlingxx) +- Copy nested types upon construction ([#8244](https://github.com/rapidsai/cudf/pull/8244)) [@isVoid](https://github.com/isVoid) +- Preserve column hierarchy when getting NULL row from `LIST` column ([#8206](https://github.com/rapidsai/cudf/pull/8206)) [@isVoid](https://github.com/isVoid) +- Clip decimal binary op precision at max precision 
([#8194](https://github.com/rapidsai/cudf/pull/8194)) [@ChrisJar](https://github.com/ChrisJar) + +## 📖 Documentation + +- Add docstring for `dask_cudf.read_csv` ([#8355](https://github.com/rapidsai/cudf/pull/8355)) [@galipremsagar](https://github.com/galipremsagar) +- Fix cudf release version in readme ([#8331](https://github.com/rapidsai/cudf/pull/8331)) [@galipremsagar](https://github.com/galipremsagar) +- Fix structs column description in dev docs ([#8318](https://github.com/rapidsai/cudf/pull/8318)) [@isVoid](https://github.com/isVoid) +- Update readme with correct CUDA versions ([#8315](https://github.com/rapidsai/cudf/pull/8315)) [@raydouglass](https://github.com/raydouglass) +- Add description of the cuIO GDS integration ([#8293](https://github.com/rapidsai/cudf/pull/8293)) [@vuule](https://github.com/vuule) +- Remove unused parameter from copy_partition kernel documentation ([#8283](https://github.com/rapidsai/cudf/pull/8283)) [@robertmaynard](https://github.com/robertmaynard) + +## 🚀 New Features + +- Add support merging b/w categorical data ([#8332](https://github.com/rapidsai/cudf/pull/8332)) [@galipremsagar](https://github.com/galipremsagar) +- Java: Support struct scalar ([#8327](https://github.com/rapidsai/cudf/pull/8327)) [@sperlingxx](https://github.com/sperlingxx) +- added _is_homogeneous property ([#8299](https://github.com/rapidsai/cudf/pull/8299)) [@shaneding](https://github.com/shaneding) +- Added decimal writing for CSV writer ([#8296](https://github.com/rapidsai/cudf/pull/8296)) [@kaatish](https://github.com/kaatish) +- Java: Support creating a scalar from utf8 string ([#8294](https://github.com/rapidsai/cudf/pull/8294)) [@firestarman](https://github.com/firestarman) +- Add Java API for Concatenate strings with separator ([#8289](https://github.com/rapidsai/cudf/pull/8289)) 
[@tgravescs](https://github.com/tgravescs) +- `strings::join_list_elements` options for empty list inputs ([#8285](https://github.com/rapidsai/cudf/pull/8285)) [@ttnghia](https://github.com/ttnghia) +- Return python lists for __getitem__ calls to list type series ([#8265](https://github.com/rapidsai/cudf/pull/8265)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- add unit tests for lead/lag on list for row window ([#8259](https://github.com/rapidsai/cudf/pull/8259)) [@wbo4958](https://github.com/wbo4958) +- Create a String column from UTF8 String byte arrays ([#8257](https://github.com/rapidsai/cudf/pull/8257)) [@firestarman](https://github.com/firestarman) +- Support scattering `list_scalar` ([#8256](https://github.com/rapidsai/cudf/pull/8256)) [@isVoid](https://github.com/isVoid) +- Implement `lists::concatenate_list_elements` ([#8231](https://github.com/rapidsai/cudf/pull/8231)) [@ttnghia](https://github.com/ttnghia) +- Support for struct scalars. 
([#8220](https://github.com/rapidsai/cudf/pull/8220)) [@nvdbaranec](https://github.com/nvdbaranec) +- Add support for decimal types in ORC writer ([#8198](https://github.com/rapidsai/cudf/pull/8198)) [@vuule](https://github.com/vuule) +- Support create lists column from a `list_scalar` ([#8185](https://github.com/rapidsai/cudf/pull/8185)) [@isVoid](https://github.com/isVoid) +- `Groupby.shift` c++ API refactor and python binding ([#8131](https://github.com/rapidsai/cudf/pull/8131)) [@isVoid](https://github.com/isVoid) +- Add `groupby::replace_nulls(replace_policy)` api ([#7118](https://github.com/rapidsai/cudf/pull/7118)) [@isVoid](https://github.com/isVoid) + +## 🛠️ Improvements + +- Support Dask + Distributed 2021.05.1 ([#8392](https://github.com/rapidsai/cudf/pull/8392)) [@jakirkham](https://github.com/jakirkham) +- Add aliases for string methods ([#8353](https://github.com/rapidsai/cudf/pull/8353)) [@shwina](https://github.com/shwina) +- Update environment variable used to determine `cuda_version` ([#8321](https://github.com/rapidsai/cudf/pull/8321)) [@ajschmidt8](https://github.com/ajschmidt8) +- JNI: Refactor the code of making column from scalar ([#8310](https://github.com/rapidsai/cudf/pull/8310)) [@firestarman](https://github.com/firestarman) +- Update `CHANGELOG.md` links for calver ([#8303](https://github.com/rapidsai/cudf/pull/8303)) [@ajschmidt8](https://github.com/ajschmidt8) +- Merge `branch-0.19` into `branch-21.06` ([#8302](https://github.com/rapidsai/cudf/pull/8302)) [@ajschmidt8](https://github.com/ajschmidt8) +- use address and length for GDS reads/writes ([#8301](https://github.com/rapidsai/cudf/pull/8301)) [@rongou](https://github.com/rongou) +- Update cudfjni version to 21.06.0 ([#8292](https://github.com/rapidsai/cudf/pull/8292)) [@pxLi](https://github.com/pxLi) +- Update docs build 
script ([#8284](https://github.com/rapidsai/cudf/pull/8284)) [@ajschmidt8](https://github.com/ajschmidt8) +- Make device_buffer streams explicit and enforce move construction ([#8280](https://github.com/rapidsai/cudf/pull/8280)) [@harrism](https://github.com/harrism) +- Introduce a common parent class for NumericalColumn and DecimalColumn ([#8278](https://github.com/rapidsai/cudf/pull/8278)) [@vyasr](https://github.com/vyasr) +- Do not add nulls to the hash table when null_equality::NOT_EQUAL is passed to left_semi_join and left_anti_join ([#8277](https://github.com/rapidsai/cudf/pull/8277)) [@nvdbaranec](https://github.com/nvdbaranec) +- Enable implicit casting when concatenating mixed types ([#8276](https://github.com/rapidsai/cudf/pull/8276)) [@ChrisJar](https://github.com/ChrisJar) +- Fix CMake FindPackage rmm, pin dev envs' dlpack to v0.3 ([#8271](https://github.com/rapidsai/cudf/pull/8271)) [@trxcllnt](https://github.com/trxcllnt) +- Update cudfjni version to 21.06 ([#8267](https://github.com/rapidsai/cudf/pull/8267)) [@pxLi](https://github.com/pxLi) +- support RMM aligned resource adapter in JNI ([#8266](https://github.com/rapidsai/cudf/pull/8266)) [@rongou](https://github.com/rongou) +- Pass compiler environment variables to conda python build ([#8260](https://github.com/rapidsai/cudf/pull/8260)) [@Ethyling](https://github.com/Ethyling) +- Remove abc inheritance from Serializable ([#8254](https://github.com/rapidsai/cudf/pull/8254)) [@vyasr](https://github.com/vyasr) +- Move more methods into SingleColumnFrame ([#8253](https://github.com/rapidsai/cudf/pull/8253)) [@vyasr](https://github.com/vyasr) +- Update ORC statistics API to use C++17 standard library ([#8241](https://github.com/rapidsai/cudf/pull/8241)) [@vuule](https://github.com/vuule) +- Correct unused parameter warnings in dictonary algorithms 
([#8239](https://github.com/rapidsai/cudf/pull/8239)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameters in the copying algorithms ([#8232](https://github.com/rapidsai/cudf/pull/8232)) [@robertmaynard](https://github.com/robertmaynard) +- IO statistics cleanup ([#8191](https://github.com/rapidsai/cudf/pull/8191)) [@kaatish](https://github.com/kaatish) +- Refactor of rolling_window implementation. ([#8158](https://github.com/rapidsai/cudf/pull/8158)) [@nvdbaranec](https://github.com/nvdbaranec) +- Add a flag for allowing single quotes in JSON strings. ([#8144](https://github.com/rapidsai/cudf/pull/8144)) [@nvdbaranec](https://github.com/nvdbaranec) +- Column refactoring 2 ([#8130](https://github.com/rapidsai/cudf/pull/8130)) [@vyasr](https://github.com/vyasr) +- support space in workspace ([#7956](https://github.com/rapidsai/cudf/pull/7956)) [@jolorunyomi](https://github.com/jolorunyomi) +- Support collect_set on rolling window ([#7881](https://github.com/rapidsai/cudf/pull/7881)) [@sperlingxx](https://github.com/sperlingxx) + +# cuDF 0.19.0 (21 Apr 2021) + +## 🚨 Breaking Changes + +- Allow hash_partition to take a seed value ([#7771](https://github.com/rapidsai/cudf/pull/7771)) [@magnatelee](https://github.com/magnatelee) +- Allow merging index column with data column using keyword "on" ([#7736](https://github.com/rapidsai/cudf/pull/7736)) [@skirui-source](https://github.com/skirui-source) +- Change JNI API to avoid loading native dependencies when creating sort order classes. ([#7729](https://github.com/rapidsai/cudf/pull/7729)) [@revans2](https://github.com/revans2) +- Replace device_vector with device_uvector in null_mask ([#7715](https://github.com/rapidsai/cudf/pull/7715)) [@harrism](https://github.com/harrism) +- Don't identify decimals as strings. 
([#7710](https://github.com/rapidsai/cudf/pull/7710)) [@vyasr](https://github.com/vyasr) +- Fix Java Parquet write after writer API changes ([#7655](https://github.com/rapidsai/cudf/pull/7655)) [@revans2](https://github.com/revans2) +- Convert cudf::concatenate APIs to use spans and device_uvector ([#7621](https://github.com/rapidsai/cudf/pull/7621)) [@harrism](https://github.com/harrism) +- Update missing docstring examples in python public APIs ([#7546](https://github.com/rapidsai/cudf/pull/7546)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unneeded step parameter from strings::detail::copy_slice ([#7525](https://github.com/rapidsai/cudf/pull/7525)) [@davidwendt](https://github.com/davidwendt) +- Rename ARROW_STATIC_LIB because it conflicts with one in FindArrow.cmake ([#7518](https://github.com/rapidsai/cudf/pull/7518)) [@trxcllnt](https://github.com/trxcllnt) +- Match Pandas logic for comparing two objects with nulls ([#7490](https://github.com/rapidsai/cudf/pull/7490)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add struct support to parquet writer ([#7461](https://github.com/rapidsai/cudf/pull/7461)) [@devavret](https://github.com/devavret) +- Join APIs that return gathermaps ([#7454](https://github.com/rapidsai/cudf/pull/7454)) [@shwina](https://github.com/shwina) +- `fixed_point` + `cudf::binary_operation` API Changes ([#7435](https://github.com/rapidsai/cudf/pull/7435)) [@codereport](https://github.com/codereport) +- Fix BUG: Exception when PYTHONOPTIMIZE=2 ([#7434](https://github.com/rapidsai/cudf/pull/7434)) [@skirui-source](https://github.com/skirui-source) +- Change nvtext::load_vocabulary_file to return a unique ptr ([#7424](https://github.com/rapidsai/cudf/pull/7424)) [@davidwendt](https://github.com/davidwendt) +- Refactor 
strings column factories ([#7397](https://github.com/rapidsai/cudf/pull/7397)) [@harrism](https://github.com/harrism) +- Use CMAKE_CUDA_ARCHITECTURES ([#7391](https://github.com/rapidsai/cudf/pull/7391)) [@robertmaynard](https://github.com/robertmaynard) +- Upgrade pandas to 1.2 ([#7375](https://github.com/rapidsai/cudf/pull/7375)) [@galipremsagar](https://github.com/galipremsagar) +- Rename `logical_cast` to `bit_cast` and allow additional conversions ([#7373](https://github.com/rapidsai/cudf/pull/7373)) [@ttnghia](https://github.com/ttnghia) +- Rework libcudf CMakeLists.txt to export targets for CPM ([#7107](https://github.com/rapidsai/cudf/pull/7107)) [@trxcllnt](https://github.com/trxcllnt) + +## 🐛 Bug Fixes + +- Fix a `NameError` in meta dispatch API ([#7996](https://github.com/rapidsai/cudf/pull/7996)) [@galipremsagar](https://github.com/galipremsagar) +- Reindex in `DataFrame.__setitem__` ([#7957](https://github.com/rapidsai/cudf/pull/7957)) [@galipremsagar](https://github.com/galipremsagar) +- jitify direct-to-cubin compilation and caching. 
([#7919](https://github.com/rapidsai/cudf/pull/7919)) [@cwharris](https://github.com/cwharris) +- Use dynamic cudart for nvcomp in java build ([#7896](https://github.com/rapidsai/cudf/pull/7896)) [@abellina](https://github.com/abellina) +- fix "incompatible redefinition" warnings ([#7894](https://github.com/rapidsai/cudf/pull/7894)) [@cwharris](https://github.com/cwharris) +- cudf consistently specifies the cuda runtime ([#7887](https://github.com/rapidsai/cudf/pull/7887)) [@robertmaynard](https://github.com/robertmaynard) +- disable verbose output for jitify_preprocess ([#7886](https://github.com/rapidsai/cudf/pull/7886)) [@cwharris](https://github.com/cwharris) +- CMake jit_preprocess_files function only runs when needed ([#7872](https://github.com/rapidsai/cudf/pull/7872)) [@robertmaynard](https://github.com/robertmaynard) +- Push DeviceScalar construction into cython for list.contains ([#7864](https://github.com/rapidsai/cudf/pull/7864)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- cudf now sets an install rpath of $ORIGIN ([#7863](https://github.com/rapidsai/cudf/pull/7863)) [@robertmaynard](https://github.com/robertmaynard) +- Don't install Thrust examples, tests, docs, and python files ([#7811](https://github.com/rapidsai/cudf/pull/7811)) [@robertmaynard](https://github.com/robertmaynard) +- Sort by index in groupby tests more consistently ([#7802](https://github.com/rapidsai/cudf/pull/7802)) [@shwina](https://github.com/shwina) +- Revert "Update conda recipes pinning of repo dependencies ([#7743)" (#7793](https://github.com/rapidsai/cudf/pull/7743)" (#7793)) [@raydouglass](https://github.com/raydouglass) +- Add decimal column handling in copy_type_metadata ([#7788](https://github.com/rapidsai/cudf/pull/7788)) [@shwina](https://github.com/shwina) +- Add 
column names validation in parquet writer ([#7786](https://github.com/rapidsai/cudf/pull/7786)) [@galipremsagar](https://github.com/galipremsagar) +- Fix Java explode outer unit tests ([#7782](https://github.com/rapidsai/cudf/pull/7782)) [@jlowe](https://github.com/jlowe) +- Fix compiler warning about non-POD types passed through ellipsis ([#7781](https://github.com/rapidsai/cudf/pull/7781)) [@jrhemstad](https://github.com/jrhemstad) +- User resource fix for replace_nulls ([#7769](https://github.com/rapidsai/cudf/pull/7769)) [@magnatelee](https://github.com/magnatelee) +- Fix type dispatch for columnar replace_nulls ([#7768](https://github.com/rapidsai/cudf/pull/7768)) [@jlowe](https://github.com/jlowe) +- Add `ignore_order` parameter to dask-cudf concat dispatch ([#7765](https://github.com/rapidsai/cudf/pull/7765)) [@galipremsagar](https://github.com/galipremsagar) +- Fix slicing and arrow representations of decimal columns ([#7755](https://github.com/rapidsai/cudf/pull/7755)) [@vyasr](https://github.com/vyasr) +- Fixing issue with explode_outer position not nulling position entries of null rows ([#7754](https://github.com/rapidsai/cudf/pull/7754)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Implement scatter for struct columns ([#7752](https://github.com/rapidsai/cudf/pull/7752)) [@ttnghia](https://github.com/ttnghia) +- Fix data corruption in string columns ([#7746](https://github.com/rapidsai/cudf/pull/7746)) [@galipremsagar](https://github.com/galipremsagar) +- Fix string length in stripe dictionary building ([#7744](https://github.com/rapidsai/cudf/pull/7744)) [@kaatish](https://github.com/kaatish) +- Update conda recipes pinning of repo dependencies ([#7743](https://github.com/rapidsai/cudf/pull/7743)) [@mike-wendt](https://github.com/mike-wendt) +- Enable 
dask dispatch to cuDF's `is_categorical_dtype` for cuDF objects ([#7740](https://github.com/rapidsai/cudf/pull/7740)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix dictionary size computation in ORC writer ([#7737](https://github.com/rapidsai/cudf/pull/7737)) [@vuule](https://github.com/vuule) +- Fix `cudf::cast` overflow for `decimal64` to `int32_t` or smaller in certain cases ([#7733](https://github.com/rapidsai/cudf/pull/7733)) [@codereport](https://github.com/codereport) +- Change JNI API to avoid loading native dependencies when creating sort order classes. ([#7729](https://github.com/rapidsai/cudf/pull/7729)) [@revans2](https://github.com/revans2) +- Disable column_view data accessors for unsupported types ([#7725](https://github.com/rapidsai/cudf/pull/7725)) [@jrhemstad](https://github.com/jrhemstad) +- Materialize `RangeIndex` when `index=True` in parquet writer ([#7711](https://github.com/rapidsai/cudf/pull/7711)) [@galipremsagar](https://github.com/galipremsagar) +- Don't identify decimals as strings. 
([#7710](https://github.com/rapidsai/cudf/pull/7710)) [@vyasr](https://github.com/vyasr) +- Fix return type of `DataFrame.argsort` ([#7706](https://github.com/rapidsai/cudf/pull/7706)) [@galipremsagar](https://github.com/galipremsagar) +- Fix/correct cudf installed package requirements ([#7688](https://github.com/rapidsai/cudf/pull/7688)) [@robertmaynard](https://github.com/robertmaynard) +- Fix SparkMurmurHash3_32 hash inconsistencies with Apache Spark ([#7672](https://github.com/rapidsai/cudf/pull/7672)) [@jlowe](https://github.com/jlowe) +- Fix ORC reader issue with reading empty string columns ([#7656](https://github.com/rapidsai/cudf/pull/7656)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Fix Java Parquet write after writer API changes ([#7655](https://github.com/rapidsai/cudf/pull/7655)) [@revans2](https://github.com/revans2) +- Fixing empty null lists throwing explode_outer for a loop. 
([#7649](https://github.com/rapidsai/cudf/pull/7649)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix internal compiler error during JNI Docker build ([#7645](https://github.com/rapidsai/cudf/pull/7645)) [@jlowe](https://github.com/jlowe) +- Fix Debug build break with device_uvectors in grouped_rolling.cu ([#7633](https://github.com/rapidsai/cudf/pull/7633)) [@mythrocks](https://github.com/mythrocks) +- Parquet reader: Fix issue when using skip_rows on non-nested columns containing nulls ([#7627](https://github.com/rapidsai/cudf/pull/7627)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix ORC reader for empty DataFrame/Table ([#7624](https://github.com/rapidsai/cudf/pull/7624)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Fix specifying GPU architecture in JNI build ([#7612](https://github.com/rapidsai/cudf/pull/7612)) [@jlowe](https://github.com/jlowe) +- Fix ORC writer OOM issue ([#7605](https://github.com/rapidsai/cudf/pull/7605)) [@vuule](https://github.com/vuule) +- Fix 0.18 --> 0.19 automerge ([#7589](https://github.com/rapidsai/cudf/pull/7589)) [@kkraus14](https://github.com/kkraus14) +- Fix ORC issue with incorrect timestamp nanosecond values ([#7581](https://github.com/rapidsai/cudf/pull/7581)) [@vuule](https://github.com/vuule) +- Fix missing Dask imports ([#7580](https://github.com/rapidsai/cudf/pull/7580)) [@kkraus14](https://github.com/kkraus14) +- CMAKE_CUDA_ARCHITECTURES doesn't change when build-system invokes cmake ([#7579](https://github.com/rapidsai/cudf/pull/7579)) [@robertmaynard](https://github.com/robertmaynard) +- Another fix for offsets_end() iterator in lists_column_view ([#7575](https://github.com/rapidsai/cudf/pull/7575)) [@ttnghia](https://github.com/ttnghia) +- Fix ORC writer output corruption with string columns 
([#7565](https://github.com/rapidsai/cudf/pull/7565)) [@vuule](https://github.com/vuule) +- Fix cudf::lists::sort_lists failing for sliced column ([#7564](https://github.com/rapidsai/cudf/pull/7564)) [@ttnghia](https://github.com/ttnghia) +- FIX Fix Anaconda upload args ([#7558](https://github.com/rapidsai/cudf/pull/7558)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- Fix index mismatch issue in equality related APIs ([#7555](https://github.com/rapidsai/cudf/pull/7555)) [@galipremsagar](https://github.com/galipremsagar) +- FIX Revert gpuci_conda_retry on conda file output locations ([#7552](https://github.com/rapidsai/cudf/pull/7552)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- Fix offset_end iterator for lists_column_view, which was not correctl… ([#7551](https://github.com/rapidsai/cudf/pull/7551)) [@ttnghia](https://github.com/ttnghia) +- Fix no such file dlpack.h error when build libcudf ([#7549](https://github.com/rapidsai/cudf/pull/7549)) [@chenrui17](https://github.com/chenrui17) +- Update missing docstring examples in python public APIs ([#7546](https://github.com/rapidsai/cudf/pull/7546)) [@galipremsagar](https://github.com/galipremsagar) +- Decimal32 Build Fix ([#7544](https://github.com/rapidsai/cudf/pull/7544)) [@razajafri](https://github.com/razajafri) +- FIX Retry conda output location ([#7540](https://github.com/rapidsai/cudf/pull/7540)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- fix missing renames of dask git branches from master to main ([#7535](https://github.com/rapidsai/cudf/pull/7535)) [@kkraus14](https://github.com/kkraus14) +- Remove detail from device_span ([#7533](https://github.com/rapidsai/cudf/pull/7533)) [@rwlee](https://github.com/rwlee) +- Change dask and distributed branch to main 
([#7532](https://github.com/rapidsai/cudf/pull/7532)) [@dantegd](https://github.com/dantegd) +- Update JNI build to use CUDF_USE_ARROW_STATIC ([#7526](https://github.com/rapidsai/cudf/pull/7526)) [@jlowe](https://github.com/jlowe) +- Make sure rmm::rmm CMake target is visibile to cudf users ([#7524](https://github.com/rapidsai/cudf/pull/7524)) [@robertmaynard](https://github.com/robertmaynard) +- Fix contiguous_split not properly handling output partitions > 2 GB. ([#7515](https://github.com/rapidsai/cudf/pull/7515)) [@nvdbaranec](https://github.com/nvdbaranec) +- Change jit launch to safe_launch ([#7510](https://github.com/rapidsai/cudf/pull/7510)) [@devavret](https://github.com/devavret) +- Fix comparison between Datetime/Timedelta columns and NULL scalars ([#7504](https://github.com/rapidsai/cudf/pull/7504)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix off-by-one error in char-parallel string scalar replace ([#7502](https://github.com/rapidsai/cudf/pull/7502)) [@jlowe](https://github.com/jlowe) +- Fix JNI deprecation of all, put it on the wrong version before ([#7501](https://github.com/rapidsai/cudf/pull/7501)) [@revans2](https://github.com/revans2) +- Fix Series/Dataframe Mixed Arithmetic ([#7491](https://github.com/rapidsai/cudf/pull/7491)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix JNI build after removal of libcudf sub-libraries ([#7486](https://github.com/rapidsai/cudf/pull/7486)) [@jlowe](https://github.com/jlowe) +- Correctly compile benchmarks ([#7485](https://github.com/rapidsai/cudf/pull/7485)) [@robertmaynard](https://github.com/robertmaynard) +- Fix bool column corruption with ORC Reader ([#7483](https://github.com/rapidsai/cudf/pull/7483)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Fix `__repr__` for 
categorical dtype ([#7476](https://github.com/rapidsai/cudf/pull/7476)) [@galipremsagar](https://github.com/galipremsagar) +- Java cleaner synchronization ([#7474](https://github.com/rapidsai/cudf/pull/7474)) [@abellina](https://github.com/abellina) +- Fix java float/double parsing tests ([#7473](https://github.com/rapidsai/cudf/pull/7473)) [@revans2](https://github.com/revans2) +- Pass stream and user resource to make_default_constructed_scalar ([#7469](https://github.com/rapidsai/cudf/pull/7469)) [@magnatelee](https://github.com/magnatelee) +- Improve stability of dask_cudf.DataFrame.var and dask_cudf.DataFrame.std ([#7453](https://github.com/rapidsai/cudf/pull/7453)) [@rjzamora](https://github.com/rjzamora) +- Missing `device_storage_dispatch` change affecting `cudf::gather` ([#7449](https://github.com/rapidsai/cudf/pull/7449)) [@codereport](https://github.com/codereport) +- fix cuFile JNI compile errors ([#7445](https://github.com/rapidsai/cudf/pull/7445)) [@rongou](https://github.com/rongou) +- Support `Series.__setitem__` with key to a new row ([#7443](https://github.com/rapidsai/cudf/pull/7443)) [@isVoid](https://github.com/isVoid) +- Fix BUG: Exception when PYTHONOPTIMIZE=2 ([#7434](https://github.com/rapidsai/cudf/pull/7434)) [@skirui-source](https://github.com/skirui-source) +- Make inclusive scan safe for cases with leading nulls ([#7432](https://github.com/rapidsai/cudf/pull/7432)) [@magnatelee](https://github.com/magnatelee) +- Fix typo in list_device_view::pair_rep_end() ([#7423](https://github.com/rapidsai/cudf/pull/7423)) [@mythrocks](https://github.com/mythrocks) +- Fix string to double conversion and row equivalent comparison ([#7410](https://github.com/rapidsai/cudf/pull/7410)) [@ttnghia](https://github.com/ttnghia) +- Fix thrust failure when transferring 
data from device_vector to host_vector with vectors of size 1 ([#7382](https://github.com/rapidsai/cudf/pull/7382)) [@ttnghia](https://github.com/ttnghia) +- Fix std::exception catch-by-reference gcc9 compile error ([#7380](https://github.com/rapidsai/cudf/pull/7380)) [@davidwendt](https://github.com/davidwendt) +- Fix skiprows issue with ORC Reader ([#7359](https://github.com/rapidsai/cudf/pull/7359)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- fix Arrow CMake file ([#7358](https://github.com/rapidsai/cudf/pull/7358)) [@rongou](https://github.com/rongou) +- Fix lists::contains() for NaN and Decimals ([#7349](https://github.com/rapidsai/cudf/pull/7349)) [@mythrocks](https://github.com/mythrocks) +- Handle cupy array in `Dataframe.__setitem__` ([#7340](https://github.com/rapidsai/cudf/pull/7340)) [@galipremsagar](https://github.com/galipremsagar) +- Fix invalid-device-fn error in cudf::strings::replace_re with multiple regex's ([#7336](https://github.com/rapidsai/cudf/pull/7336)) [@davidwendt](https://github.com/davidwendt) +- FIX Add codecov upload block to gpu script ([#6860](https://github.com/rapidsai/cudf/pull/6860)) [@dillon-cullinan](https://github.com/dillon-cullinan) + +## 📖 Documentation + +- Fix join API doxygen ([#7890](https://github.com/rapidsai/cudf/pull/7890)) [@shwina](https://github.com/shwina) +- Add Resources to README. 
([#7697](https://github.com/rapidsai/cudf/pull/7697)) [@bdice](https://github.com/bdice) +- Add `isin` examples in Docstring ([#7479](https://github.com/rapidsai/cudf/pull/7479)) [@galipremsagar](https://github.com/galipremsagar) +- Resolving unlinked type shorthands in cudf doc ([#7416](https://github.com/rapidsai/cudf/pull/7416)) [@isVoid](https://github.com/isVoid) +- Fix typo in regex.md doc page ([#7363](https://github.com/rapidsai/cudf/pull/7363)) [@davidwendt](https://github.com/davidwendt) +- Fix incorrect strings_column_view::chars_size documentation ([#7360](https://github.com/rapidsai/cudf/pull/7360)) [@jlowe](https://github.com/jlowe) + +## 🚀 New Features + +- Enable basic reductions for decimal columns ([#7776](https://github.com/rapidsai/cudf/pull/7776)) [@ChrisJar](https://github.com/ChrisJar) +- Enable join on decimal columns ([#7764](https://github.com/rapidsai/cudf/pull/7764)) [@ChrisJar](https://github.com/ChrisJar) +- Allow merging index column with data column using keyword "on" ([#7736](https://github.com/rapidsai/cudf/pull/7736)) [@skirui-source](https://github.com/skirui-source) +- Implement DecimalColumn + Scalar and add cudf.Scalars of Decimal64Dtype ([#7732](https://github.com/rapidsai/cudf/pull/7732)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add support for `unique` groupby aggregation ([#7726](https://github.com/rapidsai/cudf/pull/7726)) [@shwina](https://github.com/shwina) +- Expose libcudf's label_bins function to cudf ([#7724](https://github.com/rapidsai/cudf/pull/7724)) [@vyasr](https://github.com/vyasr) +- Adding support for equi-join on struct ([#7720](https://github.com/rapidsai/cudf/pull/7720)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Add decimal column comparison operations 
([#7716](https://github.com/rapidsai/cudf/pull/7716)) [@isVoid](https://github.com/isVoid) +- Implement scan operations for decimal columns ([#7707](https://github.com/rapidsai/cudf/pull/7707)) [@ChrisJar](https://github.com/ChrisJar) +- Enable typecasting between decimal and int ([#7691](https://github.com/rapidsai/cudf/pull/7691)) [@ChrisJar](https://github.com/ChrisJar) +- Enable decimal support in parquet writer ([#7673](https://github.com/rapidsai/cudf/pull/7673)) [@devavret](https://github.com/devavret) +- Adds `list.unique` API ([#7664](https://github.com/rapidsai/cudf/pull/7664)) [@isVoid](https://github.com/isVoid) +- Fix NaN handling in drop_list_duplicates ([#7662](https://github.com/rapidsai/cudf/pull/7662)) [@ttnghia](https://github.com/ttnghia) +- Add `lists.sort_values` API ([#7657](https://github.com/rapidsai/cudf/pull/7657)) [@isVoid](https://github.com/isVoid) +- Add is_integer API that can check for the validity of a string-to-integer conversion ([#7642](https://github.com/rapidsai/cudf/pull/7642)) [@ttnghia](https://github.com/ttnghia) +- Adds `explode` API ([#7607](https://github.com/rapidsai/cudf/pull/7607)) [@isVoid](https://github.com/isVoid) +- Adds `list.take`, python binding for `cudf::lists::segmented_gather` ([#7591](https://github.com/rapidsai/cudf/pull/7591)) [@isVoid](https://github.com/isVoid) +- Implement cudf::label_bins() ([#7554](https://github.com/rapidsai/cudf/pull/7554)) [@vyasr](https://github.com/vyasr) +- Add Python bindings for `lists::contains` ([#7547](https://github.com/rapidsai/cudf/pull/7547)) [@skirui-source](https://github.com/skirui-source) +- cudf::row_bit_count() support. 
([#7534](https://github.com/rapidsai/cudf/pull/7534)) [@nvdbaranec](https://github.com/nvdbaranec) +- Implement drop_list_duplicates ([#7528](https://github.com/rapidsai/cudf/pull/7528)) [@ttnghia](https://github.com/ttnghia) +- Add Python bindings for `lists::extract_lists_element` ([#7505](https://github.com/rapidsai/cudf/pull/7505)) [@skirui-source](https://github.com/skirui-source) +- Add explode_outer and explode_outer_position ([#7499](https://github.com/rapidsai/cudf/pull/7499)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Match Pandas logic for comparing two objects with nulls ([#7490](https://github.com/rapidsai/cudf/pull/7490)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add struct support to parquet writer ([#7461](https://github.com/rapidsai/cudf/pull/7461)) [@devavret](https://github.com/devavret) +- Enable type conversion from float to decimal type ([#7450](https://github.com/rapidsai/cudf/pull/7450)) [@ChrisJar](https://github.com/ChrisJar) +- Add cython for converting strings/fixed-point functions ([#7429](https://github.com/rapidsai/cudf/pull/7429)) [@davidwendt](https://github.com/davidwendt) +- Add struct column support to cudf::sort and cudf::sorted_order ([#7422](https://github.com/rapidsai/cudf/pull/7422)) [@karthikeyann](https://github.com/karthikeyann) +- Implement groupby collect_set ([#7420](https://github.com/rapidsai/cudf/pull/7420)) [@ttnghia](https://github.com/ttnghia) +- Merge branch-0.18 into branch-0.19 ([#7411](https://github.com/rapidsai/cudf/pull/7411)) [@raydouglass](https://github.com/raydouglass) +- Refactor strings column factories ([#7397](https://github.com/rapidsai/cudf/pull/7397)) [@harrism](https://github.com/harrism) +- Add groupby scan operations (sort groupby) 
([#7387](https://github.com/rapidsai/cudf/pull/7387)) [@karthikeyann](https://github.com/karthikeyann) +- Add cudf::explode_position ([#7376](https://github.com/rapidsai/cudf/pull/7376)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Add string conversion to/from decimal values libcudf APIs ([#7364](https://github.com/rapidsai/cudf/pull/7364)) [@davidwendt](https://github.com/davidwendt) +- Add groupby SUM_OF_SQUARES support ([#7362](https://github.com/rapidsai/cudf/pull/7362)) [@karthikeyann](https://github.com/karthikeyann) +- Add `Series.drop` api ([#7304](https://github.com/rapidsai/cudf/pull/7304)) [@isVoid](https://github.com/isVoid) +- get_json_object() implementation ([#7286](https://github.com/rapidsai/cudf/pull/7286)) [@nvdbaranec](https://github.com/nvdbaranec) +- Python API for `ListMethods.len()` ([#7283](https://github.com/rapidsai/cudf/pull/7283)) [@isVoid](https://github.com/isVoid) +- Support null_policy::EXCLUDE for COLLECT rolling aggregation ([#7264](https://github.com/rapidsai/cudf/pull/7264)) [@mythrocks](https://github.com/mythrocks) +- Add support for special tokens in nvtext::subword_tokenizer ([#7254](https://github.com/rapidsai/cudf/pull/7254)) [@davidwendt](https://github.com/davidwendt) +- Fix inplace update of data and add Series.update ([#7201](https://github.com/rapidsai/cudf/pull/7201)) [@galipremsagar](https://github.com/galipremsagar) +- Implement `cudf::group_by` (hash) for `decimal32` and `decimal64` ([#7190](https://github.com/rapidsai/cudf/pull/7190)) [@codereport](https://github.com/codereport) +- Adding support to specify "level" parameter for `Dataframe.rename` ([#7135](https://github.com/rapidsai/cudf/pull/7135)) [@skirui-source](https://github.com/skirui-source) + +## 🛠️ Improvements + +- fix GDS include path for version 
0.95 ([#7877](https://github.com/rapidsai/cudf/pull/7877)) [@rongou](https://github.com/rongou) +- Update `dask` + `distributed` to `2021.4.0` ([#7858](https://github.com/rapidsai/cudf/pull/7858)) [@jakirkham](https://github.com/jakirkham) +- Add ability to extract include dirs from `CUDF_HOME` ([#7848](https://github.com/rapidsai/cudf/pull/7848)) [@galipremsagar](https://github.com/galipremsagar) +- Add USE_GDS as an option in build script ([#7833](https://github.com/rapidsai/cudf/pull/7833)) [@pxLi](https://github.com/pxLi) +- add an allocate method with stream in java DeviceMemoryBuffer ([#7826](https://github.com/rapidsai/cudf/pull/7826)) [@rongou](https://github.com/rongou) +- Constrain dask and distributed versions to 2021.3.1 ([#7825](https://github.com/rapidsai/cudf/pull/7825)) [@shwina](https://github.com/shwina) +- Revert dask versioning of concat dispatch ([#7823](https://github.com/rapidsai/cudf/pull/7823)) [@galipremsagar](https://github.com/galipremsagar) +- add copy methods in Java memory buffer ([#7791](https://github.com/rapidsai/cudf/pull/7791)) [@rongou](https://github.com/rongou) +- Update README and CONTRIBUTING for 0.19 ([#7778](https://github.com/rapidsai/cudf/pull/7778)) [@robertmaynard](https://github.com/robertmaynard) +- Allow hash_partition to take a seed value ([#7771](https://github.com/rapidsai/cudf/pull/7771)) [@magnatelee](https://github.com/magnatelee) +- Turn on NVTX by default in java build ([#7761](https://github.com/rapidsai/cudf/pull/7761)) [@tgravescs](https://github.com/tgravescs) +- Add Java bindings to join gather map APIs ([#7751](https://github.com/rapidsai/cudf/pull/7751)) [@jlowe](https://github.com/jlowe) +- Add replacements column support for Java replaceNulls ([#7750](https://github.com/rapidsai/cudf/pull/7750)) 
[@jlowe](https://github.com/jlowe) +- Add Java bindings for row_bit_count ([#7749](https://github.com/rapidsai/cudf/pull/7749)) [@jlowe](https://github.com/jlowe) +- Remove unused JVM array creation ([#7748](https://github.com/rapidsai/cudf/pull/7748)) [@jlowe](https://github.com/jlowe) +- Added JNI support for new is_integer ([#7739](https://github.com/rapidsai/cudf/pull/7739)) [@revans2](https://github.com/revans2) +- Create and promote library aliases in libcudf installations ([#7734](https://github.com/rapidsai/cudf/pull/7734)) [@trxcllnt](https://github.com/trxcllnt) +- Support groupby operations for decimal dtypes ([#7731](https://github.com/rapidsai/cudf/pull/7731)) [@vyasr](https://github.com/vyasr) +- Memory map the input file only when GDS compatibility mode is not used ([#7717](https://github.com/rapidsai/cudf/pull/7717)) [@vuule](https://github.com/vuule) +- Replace device_vector with device_uvector in null_mask ([#7715](https://github.com/rapidsai/cudf/pull/7715)) [@harrism](https://github.com/harrism) +- Struct hashing support for SerialMurmur3 and SparkMurmur3 ([#7714](https://github.com/rapidsai/cudf/pull/7714)) [@jlowe](https://github.com/jlowe) +- Add gbenchmark for nvtext replace-tokens function ([#7708](https://github.com/rapidsai/cudf/pull/7708)) [@davidwendt](https://github.com/davidwendt) +- Use stream in groupby calls ([#7705](https://github.com/rapidsai/cudf/pull/7705)) [@karthikeyann](https://github.com/karthikeyann) +- Update codeowners file ([#7701](https://github.com/rapidsai/cudf/pull/7701)) [@ajschmidt8](https://github.com/ajschmidt8) +- Cleanup groupby to use host_span, device_span, device_uvector ([#7698](https://github.com/rapidsai/cudf/pull/7698)) [@karthikeyann](https://github.com/karthikeyann) +- Add gbenchmark for nvtext ngrams 
functions ([#7693](https://github.com/rapidsai/cudf/pull/7693)) [@davidwendt](https://github.com/davidwendt) +- Misc Python/Cython optimizations ([#7686](https://github.com/rapidsai/cudf/pull/7686)) [@shwina](https://github.com/shwina) +- Add gbenchmark for nvtext tokenize functions ([#7684](https://github.com/rapidsai/cudf/pull/7684)) [@davidwendt](https://github.com/davidwendt) +- Add column_device_view to orc writer ([#7676](https://github.com/rapidsai/cudf/pull/7676)) [@kaatish](https://github.com/kaatish) +- cudf_kafka now uses cuDF CMake export targets (CPM) ([#7674](https://github.com/rapidsai/cudf/pull/7674)) [@robertmaynard](https://github.com/robertmaynard) +- Add gbenchmark for nvtext normalize functions ([#7668](https://github.com/rapidsai/cudf/pull/7668)) [@davidwendt](https://github.com/davidwendt) +- Resolve unnecessary import of thrust/optional.hpp in types.hpp ([#7667](https://github.com/rapidsai/cudf/pull/7667)) [@vyasr](https://github.com/vyasr) +- Feature/optimize accessor copy ([#7660](https://github.com/rapidsai/cudf/pull/7660)) [@vyasr](https://github.com/vyasr) +- Fix `find_package(cudf)` ([#7658](https://github.com/rapidsai/cudf/pull/7658)) [@trxcllnt](https://github.com/trxcllnt) +- Work-around for gcc7 compile error on Centos7 ([#7652](https://github.com/rapidsai/cudf/pull/7652)) [@davidwendt](https://github.com/davidwendt) +- Add in JNI support for count_elements ([#7651](https://github.com/rapidsai/cudf/pull/7651)) [@revans2](https://github.com/revans2) +- Fix issues with building cudf in a non-conda environment ([#7647](https://github.com/rapidsai/cudf/pull/7647)) [@galipremsagar](https://github.com/galipremsagar) +- Refactor ConfigureCUDA to not conditionally insert compiler flags ([#7643](https://github.com/rapidsai/cudf/pull/7643)) 
[@robertmaynard](https://github.com/robertmaynard) +- Add gbenchmark for converting strings to/from timestamps ([#7641](https://github.com/rapidsai/cudf/pull/7641)) [@davidwendt](https://github.com/davidwendt) +- Handle constructing a `cudf.Scalar` from a `cudf.Scalar` ([#7639](https://github.com/rapidsai/cudf/pull/7639)) [@shwina](https://github.com/shwina) +- Add in JNI support for table partition ([#7637](https://github.com/rapidsai/cudf/pull/7637)) [@revans2](https://github.com/revans2) +- Add explicit fixed_point merge test ([#7635](https://github.com/rapidsai/cudf/pull/7635)) [@codereport](https://github.com/codereport) +- Add JNI support for IDENTITY hash partitioning ([#7626](https://github.com/rapidsai/cudf/pull/7626)) [@revans2](https://github.com/revans2) +- Java support on explode_outer ([#7625](https://github.com/rapidsai/cudf/pull/7625)) [@sperlingxx](https://github.com/sperlingxx) +- Java support of casting string from/to decimal ([#7623](https://github.com/rapidsai/cudf/pull/7623)) [@sperlingxx](https://github.com/sperlingxx) +- Convert cudf::concatenate APIs to use spans and device_uvector ([#7621](https://github.com/rapidsai/cudf/pull/7621)) [@harrism](https://github.com/harrism) +- Add gbenchmark for cudf::strings::translate function ([#7617](https://github.com/rapidsai/cudf/pull/7617)) [@davidwendt](https://github.com/davidwendt) +- Use file(COPY ) over file(INSTALL ) so cmake output is reduced ([#7616](https://github.com/rapidsai/cudf/pull/7616)) [@robertmaynard](https://github.com/robertmaynard) +- Use rmm::device_uvector in place of rmm::device_vector for ORC reader/writer and cudf::io::column_buffer ([#7614](https://github.com/rapidsai/cudf/pull/7614)) [@vuule](https://github.com/vuule) +- Refactor Java host-side buffer concatenation to expose separate steps 
([#7610](https://github.com/rapidsai/cudf/pull/7610)) [@jlowe](https://github.com/jlowe) +- Add gbenchmarks for string substrings functions ([#7603](https://github.com/rapidsai/cudf/pull/7603)) [@davidwendt](https://github.com/davidwendt) +- Refactor string conversion check ([#7599](https://github.com/rapidsai/cudf/pull/7599)) [@ttnghia](https://github.com/ttnghia) +- JNI: Pass names of children struct columns to native Arrow IPC writer ([#7598](https://github.com/rapidsai/cudf/pull/7598)) [@firestarman](https://github.com/firestarman) +- Revert "ENH Fix stale GHA and prevent duplicates " ([#7595](https://github.com/rapidsai/cudf/pull/7595)) [@mike-wendt](https://github.com/mike-wendt) +- ENH Fix stale GHA and prevent duplicates ([#7594](https://github.com/rapidsai/cudf/pull/7594)) [@mike-wendt](https://github.com/mike-wendt) +- Fix auto-detecting GPU architectures ([#7593](https://github.com/rapidsai/cudf/pull/7593)) [@trxcllnt](https://github.com/trxcllnt) +- Reduce cudf library size ([#7583](https://github.com/rapidsai/cudf/pull/7583)) [@robertmaynard](https://github.com/robertmaynard) +- Optimize cudf::make_strings_column for long strings ([#7576](https://github.com/rapidsai/cudf/pull/7576)) [@davidwendt](https://github.com/davidwendt) +- Always build and export the cudf::cudftestutil target ([#7574](https://github.com/rapidsai/cudf/pull/7574)) [@trxcllnt](https://github.com/trxcllnt) +- Eliminate literal parameters to uvector::set_element_async and device_scalar::set_value ([#7563](https://github.com/rapidsai/cudf/pull/7563)) [@harrism](https://github.com/harrism) +- Add gbenchmark for strings::concatenate ([#7560](https://github.com/rapidsai/cudf/pull/7560)) [@davidwendt](https://github.com/davidwendt) +- Update Changelog Link 
([#7550](https://github.com/rapidsai/cudf/pull/7550)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add gbenchmarks for strings replace regex functions ([#7541](https://github.com/rapidsai/cudf/pull/7541)) [@davidwendt](https://github.com/davidwendt) +- Add `__repr__` for Column and ColumnAccessor ([#7531](https://github.com/rapidsai/cudf/pull/7531)) [@shwina](https://github.com/shwina) +- Support Decimal DIV changes in cudf ([#7527](https://github.com/rapidsai/cudf/pull/7527)) [@razajafri](https://github.com/razajafri) +- Remove unneeded step parameter from strings::detail::copy_slice ([#7525](https://github.com/rapidsai/cudf/pull/7525)) [@davidwendt](https://github.com/davidwendt) +- Use device_uvector, device_span in sort groupby ([#7523](https://github.com/rapidsai/cudf/pull/7523)) [@karthikeyann](https://github.com/karthikeyann) +- Add gbenchmarks for strings extract function ([#7522](https://github.com/rapidsai/cudf/pull/7522)) [@davidwendt](https://github.com/davidwendt) +- Rename ARROW_STATIC_LIB because it conflicts with one in FindArrow.cmake ([#7518](https://github.com/rapidsai/cudf/pull/7518)) [@trxcllnt](https://github.com/trxcllnt) +- Reduce compile time/size for scan.cu ([#7516](https://github.com/rapidsai/cudf/pull/7516)) [@davidwendt](https://github.com/davidwendt) +- Change device_vector to device_uvector in nvtext source files ([#7512](https://github.com/rapidsai/cudf/pull/7512)) [@davidwendt](https://github.com/davidwendt) +- Removed unneeded includes from traits.hpp ([#7509](https://github.com/rapidsai/cudf/pull/7509)) [@davidwendt](https://github.com/davidwendt) +- FIX Remove random build directory generation for ccache ([#7508](https://github.com/rapidsai/cudf/pull/7508)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- xfail failing pytest in 
pandas 1.2.3 ([#7507](https://github.com/rapidsai/cudf/pull/7507)) [@galipremsagar](https://github.com/galipremsagar) +- JNI bit cast ([#7493](https://github.com/rapidsai/cudf/pull/7493)) [@revans2](https://github.com/revans2) +- Combine rolling window function tests ([#7480](https://github.com/rapidsai/cudf/pull/7480)) [@mythrocks](https://github.com/mythrocks) +- Prepare Changelog for Automation ([#7477](https://github.com/rapidsai/cudf/pull/7477)) [@ajschmidt8](https://github.com/ajschmidt8) +- Java support for explode position ([#7471](https://github.com/rapidsai/cudf/pull/7471)) [@sperlingxx](https://github.com/sperlingxx) +- Update 0.18 changelog entry ([#7463](https://github.com/rapidsai/cudf/pull/7463)) [@ajschmidt8](https://github.com/ajschmidt8) +- JNI: Support skipping nulls for collect aggregation ([#7457](https://github.com/rapidsai/cudf/pull/7457)) [@firestarman](https://github.com/firestarman) +- Join APIs that return gathermaps ([#7454](https://github.com/rapidsai/cudf/pull/7454)) [@shwina](https://github.com/shwina) +- Remove dependence on managed memory for multimap test ([#7451](https://github.com/rapidsai/cudf/pull/7451)) [@jrhemstad](https://github.com/jrhemstad) +- Use cuFile for Parquet IO when available ([#7444](https://github.com/rapidsai/cudf/pull/7444)) [@vuule](https://github.com/vuule) +- Statistics cleanup ([#7439](https://github.com/rapidsai/cudf/pull/7439)) [@kaatish](https://github.com/kaatish) +- Add gbenchmarks for strings filter functions ([#7438](https://github.com/rapidsai/cudf/pull/7438)) [@davidwendt](https://github.com/davidwendt) +- `fixed_point` + `cudf::binary_operation` API Changes ([#7435](https://github.com/rapidsai/cudf/pull/7435)) [@codereport](https://github.com/codereport) +- Improve string gather performance 
([#7433](https://github.com/rapidsai/cudf/pull/7433)) [@jlowe](https://github.com/jlowe) +- Don't use user resource for a temporary allocation in sort_by_key ([#7431](https://github.com/rapidsai/cudf/pull/7431)) [@magnatelee](https://github.com/magnatelee) +- Detail APIs for datetime functions ([#7430](https://github.com/rapidsai/cudf/pull/7430)) [@magnatelee](https://github.com/magnatelee) +- Replace thrust::max_element with thrust::reduce in strings findall_re ([#7428](https://github.com/rapidsai/cudf/pull/7428)) [@davidwendt](https://github.com/davidwendt) +- Add gbenchmark for strings split/split_record functions ([#7427](https://github.com/rapidsai/cudf/pull/7427)) [@davidwendt](https://github.com/davidwendt) +- Update JNI build to use CMAKE_CUDA_ARCHITECTURES ([#7425](https://github.com/rapidsai/cudf/pull/7425)) [@jlowe](https://github.com/jlowe) +- Change nvtext::load_vocabulary_file to return a unique ptr ([#7424](https://github.com/rapidsai/cudf/pull/7424)) [@davidwendt](https://github.com/davidwendt) +- Simplify type dispatch with `device_storage_dispatch` ([#7419](https://github.com/rapidsai/cudf/pull/7419)) [@codereport](https://github.com/codereport) +- Java support for casting of nested child columns ([#7417](https://github.com/rapidsai/cudf/pull/7417)) [@razajafri](https://github.com/razajafri) +- Improve scalar string replace performance for long strings ([#7415](https://github.com/rapidsai/cudf/pull/7415)) [@jlowe](https://github.com/jlowe) +- Remove unneeded temporary device vector for strings scatter specialization ([#7409](https://github.com/rapidsai/cudf/pull/7409)) [@davidwendt](https://github.com/davidwendt) +- bitmask_or implementation with bitmask refactor ([#7406](https://github.com/rapidsai/cudf/pull/7406)) [@rwlee](https://github.com/rwlee) +- Add 
other cudf::strings::replace functions to current strings replace gbenchmark ([#7403](https://github.com/rapidsai/cudf/pull/7403)) [@davidwendt](https://github.com/davidwendt) +- Clean up included headers in `device_operators.cuh` ([#7401](https://github.com/rapidsai/cudf/pull/7401)) [@codereport](https://github.com/codereport) +- Move nullable index iterator to indexalator factory ([#7399](https://github.com/rapidsai/cudf/pull/7399)) [@davidwendt](https://github.com/davidwendt) +- ENH Pass ccache variables to conda recipe & use Ninja in CI ([#7398](https://github.com/rapidsai/cudf/pull/7398)) [@Ethyling](https://github.com/Ethyling) +- upgrade maven-antrun-plugin to support maven parallel builds ([#7393](https://github.com/rapidsai/cudf/pull/7393)) [@rongou](https://github.com/rongou) +- Add gbenchmark for strings find/contains functions ([#7392](https://github.com/rapidsai/cudf/pull/7392)) [@davidwendt](https://github.com/davidwendt) +- Use CMAKE_CUDA_ARCHITECTURES ([#7391](https://github.com/rapidsai/cudf/pull/7391)) [@robertmaynard](https://github.com/robertmaynard) +- Refactor libcudf strings::replace to use make_strings_children utility ([#7384](https://github.com/rapidsai/cudf/pull/7384)) [@davidwendt](https://github.com/davidwendt) +- Added in JNI support for out of core sort algorithm ([#7381](https://github.com/rapidsai/cudf/pull/7381)) [@revans2](https://github.com/revans2) +- Upgrade pandas to 1.2 ([#7375](https://github.com/rapidsai/cudf/pull/7375)) [@galipremsagar](https://github.com/galipremsagar) +- Rename `logical_cast` to `bit_cast` and allow additional conversions ([#7373](https://github.com/rapidsai/cudf/pull/7373)) [@ttnghia](https://github.com/ttnghia) +- jitify 2 support ([#7372](https://github.com/rapidsai/cudf/pull/7372)) 
[@cwharris](https://github.com/cwharris) +- compile_udf: Cache PTX for similar functions ([#7371](https://github.com/rapidsai/cudf/pull/7371)) [@gmarkall](https://github.com/gmarkall) +- Add string scalar replace benchmark ([#7369](https://github.com/rapidsai/cudf/pull/7369)) [@jlowe](https://github.com/jlowe) +- Add gbenchmark for strings contains_re/count_re functions ([#7366](https://github.com/rapidsai/cudf/pull/7366)) [@davidwendt](https://github.com/davidwendt) +- Update orc reader and writer fuzz tests ([#7357](https://github.com/rapidsai/cudf/pull/7357)) [@galipremsagar](https://github.com/galipremsagar) +- Improve url_decode performance for long strings ([#7353](https://github.com/rapidsai/cudf/pull/7353)) [@jlowe](https://github.com/jlowe) +- `cudf::ast` Small Refactorings ([#7352](https://github.com/rapidsai/cudf/pull/7352)) [@codereport](https://github.com/codereport) +- Remove std::cout and print in the scatter test function EmptyListsOfNullableStrings. ([#7342](https://github.com/rapidsai/cudf/pull/7342)) [@ttnghia](https://github.com/ttnghia) +- Use `cudf::detail::make_counting_transform_iterator` ([#7338](https://github.com/rapidsai/cudf/pull/7338)) [@codereport](https://github.com/codereport) +- Change block size parameter from a global to a template param. 
([#7333](https://github.com/rapidsai/cudf/pull/7333)) [@nvdbaranec](https://github.com/nvdbaranec) +- Partial clean up of ORC writer ([#7324](https://github.com/rapidsai/cudf/pull/7324)) [@vuule](https://github.com/vuule) +- Add gbenchmark for cudf::strings::to_lower ([#7316](https://github.com/rapidsai/cudf/pull/7316)) [@davidwendt](https://github.com/davidwendt) +- Update Java bindings version to 0.19-SNAPSHOT ([#7307](https://github.com/rapidsai/cudf/pull/7307)) [@pxLi](https://github.com/pxLi) +- Move `cudf::test::make_counting_transform_iterator` to `cudf/detail/iterator.cuh` ([#7306](https://github.com/rapidsai/cudf/pull/7306)) [@codereport](https://github.com/codereport) +- Use string literals in `fixed_point` `release_assert`s ([#7303](https://github.com/rapidsai/cudf/pull/7303)) [@codereport](https://github.com/codereport) +- Fix merge conflicts for #7295 ([#7297](https://github.com/rapidsai/cudf/pull/7297)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add UTF-8 chars to create_random_column<string_view> benchmark utility ([#7292](https://github.com/rapidsai/cudf/pull/7292)) [@davidwendt](https://github.com/davidwendt) +- Abstracting block reduce and block scan from cuIO kernels with `cub` apis ([#7278](https://github.com/rapidsai/cudf/pull/7278)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Build.sh use cmake --build to drive build system invocation ([#7270](https://github.com/rapidsai/cudf/pull/7270)) [@robertmaynard](https://github.com/robertmaynard) +- Refactor dictionary support for reductions any/all ([#7242](https://github.com/rapidsai/cudf/pull/7242)) [@davidwendt](https://github.com/davidwendt) +- Replace stream.value() with stream for stream_view args ([#7236](https://github.com/rapidsai/cudf/pull/7236)) 
[@karthikeyann](https://github.com/karthikeyann) +- Interval index and interval_range ([#7182](https://github.com/rapidsai/cudf/pull/7182)) [@marlenezw](https://github.com/marlenezw) +- avro reader integration tests ([#7156](https://github.com/rapidsai/cudf/pull/7156)) [@cwharris](https://github.com/cwharris) +- Rework libcudf CMakeLists.txt to export targets for CPM ([#7107](https://github.com/rapidsai/cudf/pull/7107)) [@trxcllnt](https://github.com/trxcllnt) +- Adding Interval Dtype ([#6984](https://github.com/rapidsai/cudf/pull/6984)) [@marlenezw](https://github.com/marlenezw) +- Cleaning up `for` loops with `make_(counting_)transform_iterator` ([#6546](https://github.com/rapidsai/cudf/pull/6546)) [@codereport](https://github.com/codereport) # cuDF 0.18.0 (24 Feb 2021) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cfed2e1a692..dde3e2107cf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -131,14 +131,14 @@ run each time you commit changes. Compiler requirements: -* `gcc` version 7.1+ -* `nvcc` version 10.1+ +* `gcc` version 9.3+ +* `nvcc` version 11.0+ * `cmake` version 3.18.0+ CUDA/GPU requirements: -* CUDA 10.1+ -* NVIDIA driver 410.48+ +* CUDA 11.0+ +* NVIDIA driver 450.80.02+ * Pascal architecture or better You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). 
@@ -160,7 +160,7 @@ git submodule update --init --remote --recursive ```bash # create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda10.0.yml +conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda11.0.yml # activate the environment conda activate cudf_dev ``` @@ -281,8 +281,8 @@ A Dockerfile is provided with a preconfigured conda environment for building and ### Prerequisites * Install [nvidia-docker2](https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0)) for Docker + GPU support -* Verify NVIDIA driver is `410.48` or higher -* Ensure CUDA 10.0+ is installed +* Verify NVIDIA driver is `450.80.02` or higher +* Ensure CUDA 11.0+ is installed ### Usage @@ -309,16 +309,16 @@ flag. Below is a list of the available arguments and their purpose: | Build Argument | Default Value | Other Value(s) | Purpose | | --- | --- | --- | --- | -| `CUDA_VERSION` | 10.0 | 10.1, 10.2 | set CUDA version | -| `LINUX_VERSION` | ubuntu16.04 | ubuntu18.04 | set Ubuntu version | -| `CC` & `CXX` | 5 | 7 | set gcc/g++ version; **NOTE:** gcc7 requires Ubuntu 18.04 | +| `CUDA_VERSION` | 11.0 | 11.2.2 | set CUDA version | +| `LINUX_VERSION` | ubuntu18.04 | ubuntu20.04 | set Ubuntu version | +| `CC` & `CXX` | 9 | 10 | set gcc/g++ version | | `CUDF_REPO` | This repo | Forks of cuDF | set git URL to use for `git clone` | | `CUDF_BRANCH` | main | Any branch name | set git branch to checkout of `CUDF_REPO` | | `NUMBA_VERSION` | newest | >=0.40.0 | set numba version | | `NUMPY_VERSION` | newest | >=1.14.3 | set numpy version | | `PANDAS_VERSION` | newest | >=0.23.4 | set pandas version | | `PYARROW_VERSION` | 1.0.1 | Not supported | set pyarrow version | -| `CMAKE_VERSION` | newest | >=3.14 | set cmake version | +| `CMAKE_VERSION` | newest | >=3.18 | set cmake version 
| | `CYTHON_VERSION` | 0.29 | Not supported | set Cython version | | `PYTHON_VERSION` | 3.7 | 3.8 | set python version | diff --git a/Dockerfile b/Dockerfile index f48ed3646f4..eef8a04067d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,23 +1,24 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + # An integration test & dev container which builds and installs cuDF from main -ARG CUDA_VERSION=10.1 +ARG CUDA_VERSION=11.0 ARG CUDA_SHORT_VERSION=${CUDA_VERSION} -ARG LINUX_VERSION=ubuntu16.04 +ARG LINUX_VERSION=ubuntu18.04 FROM nvidia/cuda:${CUDA_VERSION}-devel-${LINUX_VERSION} ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/lib -# Needed for cudf.concat(), avoids "OSError: library nvvm not found" -ENV NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so -ENV NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice/ ENV DEBIAN_FRONTEND=noninteractive -ARG CC=5 -ARG CXX=5 +ARG CC=9 +ARG CXX=9 RUN apt update -y --fix-missing && \ apt upgrade -y && \ + apt install -y --no-install-recommends software-properties-common && \ + add-apt-repository ppa:ubuntu-toolchain-r/test && \ + apt update -y --fix-missing && \ apt install -y --no-install-recommends \ git \ gcc-${CC} \ g++-${CXX} \ - libboost-all-dev \ tzdata && \ apt-get autoremove -y && \ apt-get clean && \ @@ -66,18 +67,10 @@ RUN if [ -f /cudf/docker/package_versions.sh ]; \ conda env create --name cudf --file /cudf/conda/environments/cudf_dev_cuda${CUDA_SHORT_VERSION}.yml ; \ fi -# libcudf build/install -ENV CC=/usr/bin/gcc-${CC} -ENV CXX=/usr/bin/g++-${CXX} -RUN source activate cudf && \ - mkdir -p /cudf/cpp/build && \ - cd /cudf/cpp/build && \ - cmake .. 
-DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX} && \ - make -j"$(nproc)" install +ENV CC=/opts/conda/envs/rapids/bin/gcc-${CC} +ENV CXX=/opts/conda/envs/rapids/bin/g++-${CXX} -# cuDF build/install +# libcudf & cudf build/install RUN source activate cudf && \ - cd /cudf/python/cudf && \ - python setup.py build_ext --inplace && \ - python setup.py install && \ - python setup.py install + cd /cudf/ && \ + ./build.sh libcudf cudf diff --git a/README.md b/README.md index 6d67251b845..545e3331681 100644 --- a/README.md +++ b/README.md @@ -57,15 +57,15 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids ### CUDA/GPU requirements -* CUDA 10.1+ -* NVIDIA driver 418.39+ +* CUDA 11.0+ +* NVIDIA driver 450.80.02+ * Pascal architecture or better (Compute Capability >=6.0) ### Conda cuDF can be installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` channel: -For `cudf version == 0.19` : +For `cudf version == 0.19.2` : ```bash # for CUDA 10.1 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ @@ -79,13 +79,13 @@ conda install -c rapidsai -c nvidia -c numba -c conda-forge \ For the nightly version of `cudf` : ```bash -# for CUDA 10.1 +# for CUDA 11.0 conda install -c rapidsai-nightly -c nvidia -c numba -c conda-forge \ - cudf python=3.7 cudatoolkit=10.1 + cudf python=3.7 cudatoolkit=11.0 -# or, for CUDA 10.2 +# or, for CUDA 11.2 conda install -c rapidsai-nightly -c nvidia -c numba -c conda-forge \ - cudf python=3.7 cudatoolkit=10.2 + cudf python=3.7 cudatoolkit=11.2 ``` Note: cuDF is supported only on Linux, and with Python versions 3.7 and later. 
diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index 8dd133c8fa3..b2426e22605 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -21,15 +21,15 @@ function hasArg { export PATH=/conda/bin:/usr/local/cuda/bin:$PATH export PARALLEL_LEVEL=4 export CUDA_REL=${CUDA_VERSION%.*} -export HOME=$WORKSPACE +export HOME="$WORKSPACE" # Parse git describe -cd $WORKSPACE +cd "$WORKSPACE" export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # Set Benchmark Vars -export GBENCH_BENCHMARKS_DIR=${WORKSPACE}/cpp/build/gbenchmarks/ +export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/" # Set `LIBCUDF_KERNEL_CACHE_PATH` environment variable to $HOME/.jitify-cache because # it's local to the container's virtual file system, and not shared with other CI jobs @@ -77,8 +77,8 @@ conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \ # Install the master version of dask, distributed, and streamz logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps" pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps -logger "pip install git+https://github.com/dask/dask.git@main --upgrade --no-deps" -pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps +logger "pip install git+https://github.com/dask/dask.git@2021.05.1 --upgrade --no-deps" +pip install "git+https://github.com/dask/dask.git@2021.05.1" --upgrade --no-deps logger "pip install git+https://github.com/python-streamz/streamz.git --upgrade --no-deps" pip install "git+https://github.com/python-streamz/streamz.git" --upgrade --no-deps @@ -96,9 +96,9 @@ conda list --show-channel-urls logger "Build libcudf..." 
if [[ ${BUILD_MODE} == "pull-request" ]]; then - $WORKSPACE/build.sh clean libcudf cudf dask_cudf benchmarks tests --ptds + "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests --ptds else - $WORKSPACE/build.sh clean libcudf cudf dask_cudf benchmarks tests -l --ptds + "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests -l --ptds fi ################################################################################ @@ -144,9 +144,9 @@ function getReqs() { REQS=$(getReqs "${LIBCUDF_DEPS[@]}") -mkdir -p ${WORKSPACE}/tmp/benchmark -touch ${WORKSPACE}/tmp/benchmark/benchmarks.txt -ls ${GBENCH_BENCHMARKS_DIR} > ${WORKSPACE}/tmp/benchmark/benchmarks.txt +mkdir -p "$WORKSPACE/tmp/benchmark" +touch "$WORKSPACE/tmp/benchmark/benchmarks.txt" +ls ${GBENCH_BENCHMARKS_DIR} > "$WORKSPACE/tmp/benchmark/benchmarks.txt" #Disable error aborting while tests run, failed tests will not generate data logger "Running libcudf GBenchmarks..." @@ -161,13 +161,13 @@ do rm ./${BENCH}.json JOBEXITCODE=1 fi -done < ${WORKSPACE}/tmp/benchmark/benchmarks.txt +done < "$WORKSPACE/tmp/benchmark/benchmarks.txt" set -e -rm ${WORKSPACE}/tmp/benchmark/benchmarks.txt -cd ${WORKSPACE} -mv ${GBENCH_BENCHMARKS_DIR}/*.json ${WORKSPACE}/tmp/benchmark/ -python GBenchToASV.py -d ${WORKSPACE}/tmp/benchmark/ -t ${S3_ASV_DIR} -n libcudf -b branch-${MINOR_VERSION} -r "${REQS}" +rm "$WORKSPACE/tmp/benchmark/benchmarks.txt" +cd "$WORKSPACE" +mv ${GBENCH_BENCHMARKS_DIR}/*.json "$WORKSPACE/tmp/benchmark/" +python GBenchToASV.py -d "$WORKSPACE/tmp/benchmark/" -t ${S3_ASV_DIR} -n libcudf -b branch-${MINOR_VERSION} -r "${REQS}" ### # Run Python Benchmarks diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 17599c6d74d..981e886d31c 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
##################### # cuDF Style Tester # ##################### @@ -33,6 +33,10 @@ FLAKE_CYTHON_RETVAL=$? MYPY_CUDF=`mypy --config=python/cudf/setup.cfg python/cudf/cudf` MYPY_CUDF_RETVAL=$? +# Run pydocstyle and get results/return code +PYDOCSTYLE=`pydocstyle --config=python/.flake8 python` +PYDOCSTYLE_RETVAL=$? + # Run clang-format and check for a consistent code format CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1` CLANG_FORMAT_RETVAL=$? @@ -78,6 +82,14 @@ else echo -e "\n\n>>>> PASSED: mypy style check\n\n" fi +if [ "$PYDOCSTYLE_RETVAL" != "0" ]; then + echo -e "\n\n>>>> FAILED: pydocstyle style check; begin output\n\n" + echo -e "$PYDOCSTYLE" + echo -e "\n\n>>>> FAILED: pydocstyle style check; end output\n\n" +else + echo -e "\n\n>>>> PASSED: pydocstyle style check\n\n" +fi + if [ "$CLANG_FORMAT_RETVAL" != "0" ]; then echo -e "\n\n>>>> FAILED: clang format check; begin output\n\n" echo -e "$CLANG_FORMAT" @@ -91,7 +103,7 @@ HEADER_META=`ci/checks/headers_test.sh` HEADER_META_RETVAL=$? 
echo -e "$HEADER_META" -RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL $MYPY_CUDF_RETVAL) +RETVALS=($ISORT_RETVAL $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $PYDOCSTYLE_RETVAL $CLANG_FORMAT_RETVAL $HEADER_META_RETVAL $MYPY_CUDF_RETVAL) IFS=$'\n' RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1` diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 588debc40db..e11a0488624 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -10,7 +10,7 @@ export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} # Set home to the job's workspace -export HOME=$WORKSPACE +export HOME="$WORKSPACE" # Determine CUDA release version export CUDA_REL=${CUDA_VERSION%.*} @@ -21,10 +21,10 @@ export GPUCI_CONDA_RETRY_SLEEP=30 # Use Ninja to build, setup Conda Build Dir export CMAKE_GENERATOR="Ninja" -export CONDA_BLD_DIR="${WORKSPACE}/.conda-bld" +export CONDA_BLD_DIR="$WORKSPACE/.conda-bld" # Switch to project root; also root of repo checkout -cd $WORKSPACE +cd "$WORKSPACE" # If nightly build, append current YYMMDD to version if [[ "$BUILD_MODE" = "branch" && "$SOURCE_BRANCH" = branch-* ]] ; then @@ -42,6 +42,11 @@ gpuci_logger "Activate conda env" . 
/opt/conda/etc/profile.d/conda.sh conda activate rapids +# Remove rapidsai-nightly channel if we are building main branch +if [ "$SOURCE_BRANCH" = "main" ]; then + conda config --system --remove channels rapidsai-nightly +fi + gpuci_logger "Check compiler versions" python --version $CC --version diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh index 76059867321..ed2484814fb 100755 --- a/ci/cpu/prebuild.sh +++ b/ci/cpu/prebuild.sh @@ -14,14 +14,14 @@ else fi # upload cudf_kafka for all versions of Python -if [[ "$CUDA" == "10.1" ]]; then +if [[ "$CUDA" == "11.0" ]]; then export UPLOAD_CUDF_KAFKA=1 else export UPLOAD_CUDF_KAFKA=0 fi #We only want to upload libcudf_kafka once per python/CUDA combo -if [[ "$PYTHON" == "3.7" ]] && [[ "$CUDA" == "10.1" ]]; then +if [[ "$PYTHON" == "3.7" ]] && [[ "$CUDA" == "11.0" ]]; then export UPLOAD_LIBCUDF_KAFKA=1 else export UPLOAD_LIBCUDF_KAFKA=0 diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index 4f72f6dd772..40e80def8ae 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -29,8 +29,8 @@ fi gpuci_logger "Get conda file output locations" -export LIBCUDF_FILE=`conda build --no-build-id --croot ${WORKSPACE}/.conda-bld conda/recipes/libcudf --output` -export LIBCUDF_KAFKA_FILE=`conda build --no-build-id --croot ${WORKSPACE}/.conda-bld conda/recipes/libcudf_kafka --output` +export LIBCUDF_FILE=`conda build --no-build-id --croot "$WORKSPACE/.conda-bld" conda/recipes/libcudf --output` +export LIBCUDF_KAFKA_FILE=`conda build --no-build-id --croot "$WORKSPACE/.conda-bld" conda/recipes/libcudf_kafka --output` export CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf --python=$PYTHON --output` export DASK_CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/dask-cudf --python=$PYTHON --output` export CUDF_KAFKA_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf_kafka --python=$PYTHON --output` diff --git a/ci/docs/build.sh b/ci/docs/build.sh index 79aa513c58b..a7771124713 100755 --- 
a/ci/docs/build.sh +++ b/ci/docs/build.sh @@ -10,12 +10,11 @@ if [ -z "$PROJECT_WORKSPACE" ]; then exit 1 fi -export DOCS_WORKSPACE=$WORKSPACE/docs +export DOCS_WORKSPACE="$WORKSPACE/docs" export PATH=/conda/bin:/usr/local/cuda/bin:$PATH -export HOME=$WORKSPACE +export HOME="$WORKSPACE" export PROJECT_WORKSPACE=/rapids/cudf export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" -export NIGHTLY_VERSION=$(echo $BRANCH_VERSION | awk -F. '{print $2}') export PROJECTS=(cudf libcudf) gpuci_logger "Check environment..." diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 7614e19cc89..5f163f93410 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -17,14 +17,14 @@ export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} # Set home to the job's workspace -export HOME=$WORKSPACE +export HOME="$WORKSPACE" # Switch to project root; also root of repo checkout -cd $WORKSPACE +cd "$WORKSPACE" # Determine CUDA release version export CUDA_REL=${CUDA_VERSION%.*} -export CONDA_ARTIFACT_PATH=${WORKSPACE}/ci/artifacts/cudf/cpu/.conda-bld/ +export CONDA_ARTIFACT_PATH="$WORKSPACE/ci/artifacts/cudf/cpu/.conda-bld/" # Parse git describe export GIT_DESCRIBE_TAG=`git describe --tags` @@ -80,7 +80,7 @@ gpuci_conda_retry install -y \ "rapids-notebook-env=$MINOR_VERSION.*" \ "dask-cuda=${MINOR_VERSION}" \ "rmm=$MINOR_VERSION.*" \ - "ucx-py=${MINOR_VERSION}" + "ucx-py=0.20.*" # https://docs.rapids.ai/maintainers/depmgmt/ # gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env @@ -101,8 +101,8 @@ function install_dask { # Install the main version of dask, distributed, and streamz gpuci_logger "Install the main version of dask, distributed, and streamz" set -x - pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps - pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps + pip install "git+https://github.com/dask/distributed.git@2021.05.1" --upgrade --no-deps + pip 
install "git+https://github.com/dask/dask.git@2021.05.1" --upgrade --no-deps pip install "git+https://github.com/python-streamz/streamz.git" --upgrade --no-deps set +x } @@ -117,9 +117,9 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then gpuci_logger "Build from source" if [[ ${BUILD_MODE} == "pull-request" ]]; then - $WORKSPACE/build.sh clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds + "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds else - $WORKSPACE/build.sh clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests -l --ptds + "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests -l --ptds fi ################################################################################ @@ -140,12 +140,12 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then gpuci_logger "GoogleTests" set -x - cd $WORKSPACE/cpp/build + cd "$WORKSPACE/cpp/build" - for gt in ${WORKSPACE}/cpp/build/gtests/* ; do + for gt in "$WORKSPACE/cpp/build/gtests/"* ; do test_name=$(basename ${gt}) echo "Running GoogleTest $test_name" - ${gt} --gtest_output=xml:${WORKSPACE}/test-results/ + ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" done fi else @@ -168,7 +168,7 @@ else for gt in gtests/* ; do test_name=$(basename ${gt}) echo "Running GoogleTest $test_name" - ${gt} --gtest_output=xml:${WORKSPACE}/test-results/ + ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" done CUDF_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf-*.tar.bz2"` @@ -185,9 +185,9 @@ else gpuci_logger "Build python libs from source" if [[ ${BUILD_MODE} == "pull-request" ]]; then - $WORKSPACE/build.sh cudf dask_cudf cudf_kafka --ptds + "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka --ptds else - $WORKSPACE/build.sh cudf dask_cudf cudf_kafka -l --ptds + "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka -l --ptds fi fi @@ -205,21 +205,21 @@ fi # TEST - Run 
py.test, notebooks ################################################################################ -cd $WORKSPACE/python/cudf +cd "$WORKSPACE/python/cudf" gpuci_logger "Python py.test for cuDF" -py.test -n 6 --cache-clear --basetemp=${WORKSPACE}/cudf-cuda-tmp --junitxml=${WORKSPACE}/junit-cudf.xml -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:${WORKSPACE}/python/cudf/cudf-coverage.xml --cov-report term +py.test -n 6 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term -cd $WORKSPACE/python/dask_cudf +cd "$WORKSPACE/python/dask_cudf" gpuci_logger "Python py.test for dask-cudf" -py.test -n 6 --cache-clear --basetemp=${WORKSPACE}/dask-cudf-cuda-tmp --junitxml=${WORKSPACE}/junit-dask-cudf.xml -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:${WORKSPACE}/python/dask_cudf/dask-cudf-coverage.xml --cov-report term +py.test -n 6 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term -cd $WORKSPACE/python/custreamz +cd "$WORKSPACE/python/custreamz" gpuci_logger "Python py.test for cuStreamz" -py.test -n 6 --cache-clear --basetemp=${WORKSPACE}/custreamz-cuda-tmp --junitxml=${WORKSPACE}/junit-custreamz.xml -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:${WORKSPACE}/python/custreamz/custreamz-coverage.xml --cov-report term +py.test -n 6 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term gpuci_logger "Test notebooks" -${WORKSPACE}/ci/gpu/test-notebooks.sh 2>&1 | tee nbtest.log -python ${WORKSPACE}/ci/utils/nbtestlog2junitxml.py 
nbtest.log +"$WORKSPACE/ci/gpu/test-notebooks.sh" 2>&1 | tee nbtest.log +python "$WORKSPACE/ci/utils/nbtestlog2junitxml.py" nbtest.log if [ -n "${CODECOV_TOKEN}" ]; then codecov -t $CODECOV_TOKEN diff --git a/ci/gpu/test-notebooks.sh b/ci/gpu/test-notebooks.sh index ffa2e2a7214..1a5c2614000 100755 --- a/ci/gpu/test-notebooks.sh +++ b/ci/gpu/test-notebooks.sh @@ -1,8 +1,8 @@ #!/bin/bash -NOTEBOOKS_DIR=${WORKSPACE}/notebooks -NBTEST=${WORKSPACE}/ci/utils/nbtest.sh -LIBCUDF_KERNEL_CACHE_PATH=${WORKSPACE}/.jitcache +NOTEBOOKS_DIR="$WORKSPACE/notebooks" +NBTEST="$WORKSPACE/ci/utils/nbtest.sh" +LIBCUDF_KERNEL_CACHE_PATH="$WORKSPACE/.jitcache" cd ${NOTEBOOKS_DIR} TOPLEVEL_NB_FOLDERS=$(find . -name *.ipynb |cut -d'/' -f2|sort -u) diff --git a/ci/local/build.sh b/ci/local/build.sh index 6ee415605b6..1bfb8b63fef 100755 --- a/ci/local/build.sh +++ b/ci/local/build.sh @@ -3,7 +3,7 @@ GIT_DESCRIBE_TAG=`git describe --tags` MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` -DOCKER_IMAGE="gpuci/rapidsai:${MINOR_VERSION}-cuda10.1-devel-ubuntu16.04-py3.7" +DOCKER_IMAGE="gpuci/rapidsai:${MINOR_VERSION}-cuda11.0-devel-ubuntu18.04-py3.7" REPO_PATH=${PWD} RAPIDS_DIR_IN_CONTAINER="/rapids" CPP_BUILD_DIR="cpp/build" diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 819a0dcf6bf..a6154e3db85 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -4,42 +4,25 @@ ######################## ## Usage -# bash update-version.sh -# where is either `major`, `minor`, `patch` +# bash update-version.sh -set -e -# Grab argument for release type -RELEASE_TYPE=$1 +# Format is YY.MM.PP - no leading 'v' or trailing 'a' +NEXT_FULL_TAG=$1 -# Get current version and calculate next versions -CURRENT_TAG=`git tag | grep -xE 'v[0-9\.]+' | sort --version-sort | tail -n 1 | tr -d 'v'` -CURRENT_MAJOR=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}'` -CURRENT_MINOR=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}'` 
-CURRENT_PATCH=`echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}'` -NEXT_MAJOR=$((CURRENT_MAJOR + 1)) -NEXT_MINOR=$((CURRENT_MINOR + 1)) -NEXT_PATCH=$((CURRENT_PATCH + 1)) +# Get current version +CURRENT_TAG=$(git tag --merged HEAD | grep -xE '^v.*' | sort --version-sort | tail -n 1 | tr -d 'v') +CURRENT_MAJOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}') +CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}') +CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}') CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} -NEXT_FULL_TAG="" -NEXT_SHORT_TAG="" -# Determine release type -if [ "$RELEASE_TYPE" == "major" ]; then - NEXT_FULL_TAG="${NEXT_MAJOR}.0.0" - NEXT_SHORT_TAG="${NEXT_MAJOR}.0" -elif [ "$RELEASE_TYPE" == "minor" ]; then - NEXT_FULL_TAG="${CURRENT_MAJOR}.${NEXT_MINOR}.0" - NEXT_SHORT_TAG="${CURRENT_MAJOR}.${NEXT_MINOR}" -elif [ "$RELEASE_TYPE" == "patch" ]; then - NEXT_FULL_TAG="${CURRENT_MAJOR}.${CURRENT_MINOR}.${NEXT_PATCH}" - NEXT_SHORT_TAG="${CURRENT_MAJOR}.${CURRENT_MINOR}" -else - echo "Incorrect release type; use 'major', 'minor', or 'patch' as an argument" - exit 1 -fi +#Get . 
for next version +NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') +NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') +NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} -echo "Preparing '$RELEASE_TYPE' release [$CURRENT_TAG -> $NEXT_FULL_TAG]" +echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" # Inplace sed replace; workaround for Linux and Mac function sed_runner() { @@ -47,11 +30,14 @@ function sed_runner() { } # cpp update -sed_runner 's/'"CUDA_DATAFRAME VERSION .* LANGUAGES"'/'"CUDA_DATAFRAME VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt +sed_runner 's/'"CUDF VERSION .* LANGUAGES"'/'"CUDF VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt # cpp libcudf_kafka update sed_runner 's/'"CUDA_KAFKA VERSION .* LANGUAGES"'/'"CUDA_KAFKA VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/libcudf_kafka/CMakeLists.txt +# cpp cudf_jni update +sed_runner 's/'"CUDF_JNI VERSION .* LANGUAGES"'/'"CUDF_JNI VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' java/src/main/native/CMakeLists.txt + # doxyfile update sed_runner 's/PROJECT_NUMBER = .*/PROJECT_NUMBER = '${NEXT_FULL_TAG}'/g' cpp/doxygen/Doxyfile @@ -69,4 +55,4 @@ sed_runner "s|\(TAGFILES.*librmm/\).*|\1${NEXT_SHORT_TAG}|" cpp/doxygen/Doxyfile # README.md update sed_runner "s/version == ${CURRENT_SHORT_TAG}/version == ${NEXT_SHORT_TAG}/g" README.md -sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md +sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md \ No newline at end of file diff --git a/ci/utils/nbtest.sh b/ci/utils/nbtest.sh index f7b9774c6fd..1b39f267c65 100755 --- a/ci/utils/nbtest.sh +++ b/ci/utils/nbtest.sh @@ -22,7 +22,7 @@ get_ipython().run_cell_magic=my_run_cell_magic NO_COLORS=--colors=NoColor EXITCODE=0 -NBTMPDIR=${WORKSPACE}/tmp +NBTMPDIR="$WORKSPACE/tmp" mkdir -p ${NBTMPDIR} for nb in $*; do diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml deleted file mode 
100644 index 26d6067b768..00000000000 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -name: cudf_dev -channels: - - rapidsai - - nvidia - - rapidsai-nightly - - conda-forge - - defaults -dependencies: - - clang=8.0.1 - - clang-tools=8.0.1 - - cupy>7.1.0,<9.0.0a0 - - rmm=0.19.* - - cmake>=3.14 - - cmake_setuptools>=0.1.3 - - python>=3.7,<3.9 - - numba>=0.49.0,!=0.51.0 - - numpy - - pandas>=1.0,<=1.2.4 - - pyarrow=1.0.1 - - fastavro>=0.22.9 - - notebook>=0.5.0 - - cython>=0.29,<0.30 - - fsspec>=0.6.0 - - pytest - - pytest-benchmark - - pytest-xdist - - sphinx - - sphinx_rtd_theme - - sphinxcontrib-websupport - - nbsphinx - - numpydoc - - ipython - - recommonmark - - pandoc=<2.0.0 - - cudatoolkit=10.1 - - pip - - flake8=3.8.3 - - black=19.10 - - isort=5.0.7 - - mypy=0.782 - - typing_extensions - - pre_commit - - dask==2021.4.0 - - distributed>=2.22.0,<=2021.4.0 - - streamz - - dlpack - - arrow-cpp=1.0.1 - - arrow-cpp-proc * cuda - - boost-cpp>=1.72.0 - - double-conversion - - rapidjson - - flatbuffers - - hypothesis - - sphinx-markdown-tables - - sphinx-copybutton - - mimesis - - packaging - - protobuf - - nvtx>=0.2.1 - - cachetools - - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main - - git+https://github.com/python-streamz/streamz.git - - pyorc diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml deleted file mode 100644 index da7e4a91106..00000000000 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. 
- -name: cudf_dev -channels: - - rapidsai - - nvidia - - rapidsai-nightly - - conda-forge - - defaults -dependencies: - - clang=8.0.1 - - clang-tools=8.0.1 - - cupy>7.1.0,<9.0.0a0 - - rmm=0.19.* - - cmake>=3.14 - - cmake_setuptools>=0.1.3 - - python>=3.7,<3.9 - - numba>=0.49,!=0.51.0 - - numpy - - pandas>=1.0,<=1.2.4 - - pyarrow=1.0.1 - - fastavro>=0.22.9 - - notebook>=0.5.0 - - cython>=0.29,<0.30 - - fsspec>=0.6.0 - - pytest - - pytest-benchmark - - pytest-xdist - - sphinx - - sphinx_rtd_theme - - sphinxcontrib-websupport - - nbsphinx - - numpydoc - - ipython - - recommonmark - - pandoc=<2.0.0 - - cudatoolkit=10.2 - - pip - - flake8=3.8.3 - - black=19.10 - - isort=5.0.7 - - mypy=0.782 - - typing_extensions - - pre_commit - - dask==2021.4.0 - - distributed>=2.22.0,<=2021.4.0 - - streamz - - dlpack - - arrow-cpp=1.0.1 - - arrow-cpp-proc * cuda - - boost-cpp>=1.72.0 - - double-conversion - - rapidjson - - flatbuffers - - hypothesis - - sphinx-markdown-tables - - sphinx-copybutton - - mimesis - - packaging - - protobuf - - nvtx>=0.2.1 - - cachetools - - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main - - git+https://github.com/python-streamz/streamz.git - - pyorc diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index b3aab1da1e5..6c742adbed1 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -6,18 +6,17 @@ channels: - nvidia - rapidsai-nightly - conda-forge - - defaults dependencies: - clang=8.0.1 - clang-tools=8.0.1 - - cupy>7.1.0,<9.0.0a0 - - rmm=0.19.* - - cmake>=3.14 + - cupy>7.1.0,<10.0.0a0 + - rmm=21.06.* + - cmake>=3.18 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 - - numba>=0.49,!=0.51.0 + - numba>=0.53.1 - numpy - - pandas>=1.0,<=1.2.4 + - pandas>=1.0,<1.3.0dev0 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 @@ -42,13 +41,12 @@ dependencies: - mypy=0.782 - typing_extensions - 
pre_commit - - dask==2021.4.0 - - distributed>=2.22.0,<=2021.4.0 + - dask>=2021.4.0,<=2021.5.1 + - distributed>=2.22.0,<=2021.5.1 - streamz - - dlpack + - dlpack>=0.5,<0.6.0a0 - arrow-cpp=1.0.1 - arrow-cpp-proc * cuda - - boost-cpp>=1.72.0 - double-conversion - rapidjson - flatbuffers @@ -60,8 +58,9 @@ dependencies: - protobuf - nvtx>=0.2.1 - cachetools + - transformers - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main + - git+https://github.com/dask/dask.git@2021.05.1 + - git+https://github.com/dask/distributed.git@2021.05.1 - git+https://github.com/python-streamz/streamz.git - pyorc diff --git a/conda/environments/cudf_dev_cuda11.1.yml b/conda/environments/cudf_dev_cuda11.1.yml deleted file mode 100644 index 7feadb5de82..00000000000 --- a/conda/environments/cudf_dev_cuda11.1.yml +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -name: cudf_dev -channels: - - rapidsai - - nvidia - - rapidsai-nightly - - conda-forge - - defaults -dependencies: - - clang=8.0.1 - - clang-tools=8.0.1 - - cupy>7.1.0,<9.0.0a0 - - rmm=0.19.* - - cmake>=3.14 - - cmake_setuptools>=0.1.3 - - python>=3.7,<3.9 - - numba>=0.49,!=0.51.0 - - numpy - - pandas>=1.0,<=1.2.4 - - pyarrow=1.0.1 - - fastavro>=0.22.9 - - notebook>=0.5.0 - - cython>=0.29,<0.30 - - fsspec>=0.6.0 - - pytest - - pytest-benchmark - - pytest-xdist - - sphinx - - sphinx_rtd_theme - - sphinxcontrib-websupport - - nbsphinx - - numpydoc - - ipython - - recommonmark - - pandoc=<2.0.0 - - cudatoolkit=11.1 - - pip - - flake8=3.8.3 - - black=19.10 - - isort=5.0.7 - - mypy=0.782 - - typing_extensions - - pre_commit - - dask==2021.4.0 - - distributed>=2.22.0,<=2021.4.0 - - streamz - - dlpack - - arrow-cpp=1.0.1 - - arrow-cpp-proc * cuda - - boost-cpp>=1.72.0 - - double-conversion - - rapidjson - - flatbuffers - - hypothesis - - sphinx-markdown-tables - - sphinx-copybutton - - mimesis - - packaging - - protobuf - - 
nvtx>=0.2.1 - - cachetools - - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main - - git+https://github.com/python-streamz/streamz.git - - pyorc diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 10ae1931d3c..41bc72d5c5b 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -6,18 +6,17 @@ channels: - nvidia - rapidsai-nightly - conda-forge - - defaults dependencies: - clang=8.0.1 - clang-tools=8.0.1 - - cupy>7.1.0,<9.0.0a0 - - rmm=0.19.* - - cmake>=3.14 + - cupy>7.1.0,<10.0.0a0 + - rmm=21.06.* + - cmake>=3.18 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 - - numba>=0.49,!=0.51.0 + - numba>=0.53.1 - numpy - - pandas>=1.0,<=1.2.4 + - pandas>=1.0,<1.3.0dev0 - pyarrow=1.0.1 - fastavro>=0.22.9 - notebook>=0.5.0 @@ -42,13 +41,12 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask==2021.4.0 - - distributed>=2.22.0,<=2021.4.0 + - dask>=2021.4.0,<=2021.5.1 + - distributed>=2.22.0,<=2021.5.1 - streamz - - dlpack + - dlpack>=0.5,<0.6.0a0 - arrow-cpp=1.0.1 - arrow-cpp-proc * cuda - - boost-cpp>=1.72.0 - double-conversion - rapidjson - flatbuffers @@ -60,8 +58,9 @@ dependencies: - protobuf - nvtx>=0.2.1 - cachetools + - transformers - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main + - git+https://github.com/dask/dask.git@2021.05.1 + - git+https://github.com/dask/distributed.git@2021.05.1 - git+https://github.com/python-streamz/streamz.git - pyorc diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index c9d2ee06d58..d1aaf924555 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = 
version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: cudf @@ -18,6 +18,9 @@ build: script_env: - VERSION_SUFFIX - PARALLEL_LEVEL + - CC + - CXX + - CUDAHOSTCXX requirements: build: @@ -25,8 +28,8 @@ requirements: - python - cython >=0.29,<0.30 - setuptools - - numba >=0.49.0 - - dlpack + - numba >=0.53.1 + - dlpack>=0.5,<0.6.0a0 - pyarrow 1.0.1 - libcudf {{ version }} - rmm {{ minor_version }} @@ -35,9 +38,9 @@ requirements: - protobuf - python - typing_extensions - - pandas >=1.0,<=1.2.4 - - cupy >7.1.0,<9.0.0a0 - - numba >=0.49.0 + - pandas >=1.0,<1.3.0dev0 + - cupy >7.1.0,<10.0.0a0 + - numba >=0.53.1 - numpy - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} - fastavro >=0.22.0 diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 0acd9ec4bb2..b59a49b0db7 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -1,9 +1,9 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: cudf_kafka @@ -24,7 +24,7 @@ build: requirements: build: - - cmake >=3.17.0 + - cmake >=3.18 host: - python - cython >=0.29,<0.30 diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index f65b3cafbd7..34b83bb1492 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: custreamz @@ -18,6 +18,9 @@ build: script_env: - VERSION_SUFFIX - PARALLEL_LEVEL + - CC + - CXX + - CUDAHOSTCXX requirements: host: @@ -28,8 +31,8 @@ requirements: - python - streamz - cudf {{ version }} - - dask >=2.22.0,<=2021.4.0 - - distributed >=2.22.0,<=2021.4.0 + - dask>=2021.4.0,<=2021.5.1 + - distributed>=2.22.0,<=2021.5.1 - python-confluent-kafka - cudf_kafka {{ version }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 8b503840b34..11c3634d2aa 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: dask-cudf @@ -18,18 +18,21 @@ build: script_env: - VERSION_SUFFIX - PARALLEL_LEVEL + - CC + - CXX + - CUDAHOSTCXX requirements: host: - python - cudf {{ version }} - - dask==2021.4.0 - - distributed >=2.22.0,<=2021.4.0 + - dask>=2021.4.0,<=2021.5.1 + - distributed>=2.22.0,<=2021.5.1 run: - python - cudf {{ version }} - - dask==2021.4.0 - - distributed >=2.22.0,<=2021.4.0 + - dask>=2021.4.0,<=2021.5.1 + - distributed>=2.22.0,<=2021.5.1 test: requires: diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh index 3fc1182b33b..472e59149b5 100644 --- a/conda/recipes/dask-cudf/run_test.sh +++ b/conda/recipes/dask-cudf/run_test.sh @@ -12,8 +12,8 @@ function logger() { logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps" pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps -logger "pip install git+https://github.com/dask/dask.git@main --upgrade --no-deps" -pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps +logger "pip install git+https://github.com/dask/dask.git@2021.05.1 --upgrade --no-deps" +pip install "git+https://github.com/dask/dask.git@2021.05.1" --upgrade --no-deps logger "python -c 'import dask_cudf'" python -c "import dask_cudf" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 75955428eab..dc41c439d27 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -2,7 +2,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: libcudf @@ -39,12 +39,10 @@ requirements: - cudatoolkit {{ cuda_version }}.* - arrow-cpp 1.0.1 - arrow-cpp-proc * cuda - - boost-cpp 1.72.0 - - dlpack + - dlpack>=0.5,<0.6.0a0 run: - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} - arrow-cpp-proc * cuda - - {{ pin_compatible('boost-cpp', max_pin='x.x.x') }} - {{ pin_compatible('dlpack', max_pin='x.x') }} test: @@ -55,7 +53,7 @@ test: - test -f $PREFIX/include/cudf/ast/transform.hpp - test -f $PREFIX/include/cudf/ast/detail/linearizer.hpp - test -f $PREFIX/include/cudf/ast/detail/operators.hpp - - test -f $PREFIX/include/cudf/ast/linearizer.hpp + - test -f $PREFIX/include/cudf/ast/nodes.hpp - test -f $PREFIX/include/cudf/ast/operators.hpp - test -f $PREFIX/include/cudf/binaryop.hpp - test -f $PREFIX/include/cudf/labeling/label_bins.hpp @@ -76,8 +74,10 @@ test: - test -f $PREFIX/include/cudf/detail/gather.hpp - test -f $PREFIX/include/cudf/detail/groupby.hpp - test -f $PREFIX/include/cudf/detail/groupby/sort_helper.hpp + - test -f $PREFIX/include/cudf/detail/groupby/group_replace_nulls.hpp - test -f $PREFIX/include/cudf/detail/hashing.hpp - test -f $PREFIX/include/cudf/detail/interop.hpp + - test -f $PREFIX/include/cudf/detail/is_element_valid.hpp - test -f $PREFIX/include/cudf/detail/null_mask.hpp - test -f $PREFIX/include/cudf/detail/nvtx/nvtx3.hpp - test -f $PREFIX/include/cudf/detail/nvtx/ranges.hpp @@ -86,7 +86,9 @@ test: - test -f $PREFIX/include/cudf/detail/repeat.hpp - test -f $PREFIX/include/cudf/detail/replace.hpp - test -f $PREFIX/include/cudf/detail/reshape.hpp + - test -f $PREFIX/include/cudf/detail/rolling.hpp - test -f $PREFIX/include/cudf/detail/round.hpp + - test -f $PREFIX/include/cudf/detail/scan.hpp - test -f $PREFIX/include/cudf/detail/scatter.hpp - test -f 
$PREFIX/include/cudf/detail/search.hpp - test -f $PREFIX/include/cudf/detail/sequence.hpp @@ -132,10 +134,14 @@ test: - test -f $PREFIX/include/cudf/io/types.hpp - test -f $PREFIX/include/cudf/ipc.hpp - test -f $PREFIX/include/cudf/join.hpp + - test -f $PREFIX/include/cudf/lists/detail/combine.hpp - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp + - test -f $PREFIX/include/cudf/lists/lists_column_factories.hpp - test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp + - test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp + - test -f $PREFIX/include/cudf/lists/combine.hpp - test -f $PREFIX/include/cudf/lists/count_elements.hpp - test -f $PREFIX/include/cudf/lists/explode.hpp - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp @@ -152,6 +158,7 @@ test: - test -f $PREFIX/include/cudf/replace.hpp - test -f $PREFIX/include/cudf/reshape.hpp - test -f $PREFIX/include/cudf/rolling.hpp + - test -f $PREFIX/include/cudf/rolling/range_window_bounds.hpp - test -f $PREFIX/include/cudf/round.hpp - test -f $PREFIX/include/cudf/scalar/scalar_factories.hpp - test -f $PREFIX/include/cudf/scalar/scalar.hpp diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index 5348ec471e9..5e06c074433 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} @@ -23,7 +23,7 @@ build: requirements: build: - - cmake >=3.17.0 + - cmake >=3.18 host: - libcudf {{ version }} - librdkafka >=1.5.0,<1.5.3 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 525e5f9225d..b961080d162 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,7 +28,7 @@ elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") set(CUDF_BUILD_FOR_DETECTED_ARCHS TRUE) endif() -project(CUDF VERSION 0.19.0 LANGUAGES C CXX) +project(CUDF VERSION 21.06.00 LANGUAGES C CXX) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, # causing subsequent runs to have different values for the `Threads::Threads` target. @@ -121,8 +121,6 @@ find_package(ZLIB REQUIRED) find_package(Threads REQUIRED) # add third party dependencies using CPM include(cmake/thirdparty/CUDF_GetCPM.cmake) -# find boost -include(cmake/thirdparty/CUDF_FindBoost.cmake) # find jitify include(cmake/thirdparty/CUDF_GetJitify.cmake) # find thrust/cub @@ -155,9 +153,11 @@ add_library(cudf src/binaryop/compiled/binary_ops.cu src/labeling/label_bins.cu src/bitmask/null_mask.cu + src/bitmask/is_element_valid.cpp src/column/column.cu src/column/column_device_view.cu src/column/column_factories.cpp + src/column/column_factories.cu src/column/column_view.cpp src/comms/ipc/ipc.cpp src/copying/concatenate.cu @@ -173,6 +173,7 @@ add_library(cudf src/copying/shift.cu src/copying/slice.cpp src/copying/split.cpp + src/copying/segmented_shift.cu src/datetime/datetime_ops.cu src/dictionary/add_keys.cu src/dictionary/decode.cu @@ -199,6 +200,7 @@ add_library(cudf src/groupby/sort/group_min.cu src/groupby/sort/group_nth_element.cu src/groupby/sort/group_nunique.cu + src/groupby/sort/group_product.cu src/groupby/sort/group_quantiles.cu src/groupby/sort/group_std.cu src/groupby/sort/group_sum.cu @@ -207,11 +209,14 @@ add_library(cudf src/groupby/sort/group_max_scan.cu src/groupby/sort/group_min_scan.cu src/groupby/sort/group_sum_scan.cu + 
src/groupby/sort/group_replace_nulls.cu src/groupby/sort/sort_helper.cu src/hash/hashing.cu + src/hash/md5_hash.cu + src/hash/murmur_hash.cu src/interop/dlpack.cpp - src/interop/from_arrow.cpp - src/interop/to_arrow.cpp + src/interop/from_arrow.cu + src/interop/to_arrow.cu src/io/avro/avro.cpp src/io/avro/avro_gpu.cu src/io/avro/reader_impl.cu @@ -246,11 +251,14 @@ add_library(cudf src/io/parquet/parquet.cpp src/io/parquet/reader_impl.cu src/io/parquet/writer_impl.cu - src/io/statistics/column_stats.cu + src/io/statistics/orc_column_statistics.cu + src/io/statistics/parquet_column_statistics.cu + src/io/utilities/column_buffer.cpp src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp src/io/utilities/file_io_utilities.cpp src/io/utilities/parsing_utils.cu + src/io/utilities/trie.cu src/io/utilities/type_conversion.cpp src/jit/cache.cpp src/jit/parser.cpp @@ -260,14 +268,17 @@ add_library(cudf src/join/join.cu src/join/semi_join.cu src/lists/contains.cu + src/lists/combine/concatenate_list_elements.cu + src/lists/combine/concatenate_rows.cu src/lists/copying/concatenate.cu src/lists/copying/copying.cu src/lists/copying/gather.cu src/lists/copying/segmented_gather.cu src/lists/count_elements.cu + src/lists/drop_list_duplicates.cu src/lists/explode.cu src/lists/extract.cu - src/lists/drop_list_duplicates.cu + src/lists/interleave_columns.cu src/lists/lists_column_factories.cu src/lists/lists_column_view.cu src/lists/segmented_sort.cu @@ -285,7 +296,9 @@ add_library(cudf src/reductions/nth_element.cu src/reductions/product.cu src/reductions/reductions.cpp - src/reductions/scan.cu + src/reductions/scan/scan.cpp + src/reductions/scan/scan_exclusive.cu + src/reductions/scan/scan_inclusive.cu src/reductions/std.cu src/reductions/sum.cu src/reductions/sum_of_squares.cu @@ -299,6 +312,7 @@ add_library(cudf src/reshape/tile.cu src/rolling/grouped_rolling.cu src/rolling/rolling.cu + src/rolling/range_window_bounds.cpp src/round/round.cu src/scalar/scalar.cpp 
src/scalar/scalar_factories.cpp @@ -320,7 +334,9 @@ add_library(cudf src/strings/case.cu src/strings/char_types/char_cases.cu src/strings/char_types/char_types.cu - src/strings/combine.cu + src/strings/combine/concatenate.cu + src/strings/combine/join.cu + src/strings/combine/join_list_elements.cu src/strings/contains.cu src/strings/convert/convert_booleans.cu src/strings/convert/convert_datetime.cu @@ -363,7 +379,7 @@ add_library(cudf src/structs/copying/concatenate.cu src/structs/structs_column_factories.cu src/structs/structs_column_view.cpp - src/structs/utilities.cu + src/structs/utilities.cpp src/table/table.cpp src/table/table_device_view.cu src/table/table_view.cpp @@ -397,9 +413,9 @@ set_target_properties(cudf PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" # set target compile options - CXX_STANDARD 14 + CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 14 + CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON INTERFACE_POSITION_INDEPENDENT_CODE ON @@ -464,7 +480,6 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries(cudf PUBLIC ZLIB::ZLIB - Boost::filesystem ${ARROW_LIBRARIES} cudf::Thrust rmm::rmm) @@ -517,7 +532,7 @@ target_compile_options(cudftestutil ) target_compile_features(cudftestutil - PUBLIC cxx_std_14 $) + PUBLIC cxx_std_17 $) target_link_libraries(cudftestutil PUBLIC GTest::gmock diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 11af408f1c5..25d012b1b33 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -17,7 +17,7 @@ find_package(Threads REQUIRED) add_library(cudf_datagen STATIC common/generate_benchmark_input.cpp) -target_compile_features(cudf_datagen PUBLIC cxx_std_14 cuda_std_14) +target_compile_features(cudf_datagen PUBLIC cxx_std_17 cuda_std_17) target_compile_options(cudf_datagen PUBLIC "$<$:${CUDF_CXX_FLAGS}>" @@ -97,14 +97,20 @@ ConfigureBench(ITERATOR_BENCH 
iterator/iterator_benchmark.cu) ################################################################################################### # - search benchmark ------------------------------------------------------------------------------ -ConfigureBench(SEARCH_BENCH search/search_benchmark.cu) +ConfigureBench(SEARCH_BENCH search/search_benchmark.cpp) ################################################################################################### # - sort benchmark -------------------------------------------------------------------------------- ConfigureBench(SORT_BENCH + sort/rank_benchmark.cpp sort/sort_benchmark.cpp sort/sort_strings_benchmark.cpp) +################################################################################################### +# - quantiles benchmark -------------------------------------------------------------------------------- +ConfigureBench(QUANTILES_BENCH + quantiles/quantiles_benchmark.cpp) + ################################################################################################### # - type_dispatcher benchmark --------------------------------------------------------------------- ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher_benchmark.cu) @@ -118,15 +124,28 @@ ConfigureBench(REDUCTION_BENCH reduction/scan_benchmark.cpp reduction/minmax_benchmark.cpp) +################################################################################################### +# - reduction benchmark --------------------------------------------------------------------------- +ConfigureBench(REPLACE_BENCH + replace/clamp_benchmark.cpp) + +################################################################################################### +# - filling benchmark ----------------------------------------------------------------------------- +ConfigureBench(FILL_BENCH + filling/repeat_benchmark.cpp) + ################################################################################################### # - groupby benchmark 
----------------------------------------------------------------------------- ConfigureBench(GROUPBY_BENCH groupby/group_sum_benchmark.cu - groupby/group_nth_benchmark.cu) + groupby/group_nth_benchmark.cu + groupby/group_shift_benchmark.cu) ################################################################################################### # - hashing benchmark ----------------------------------------------------------------------------- -ConfigureBench(HASHING_BENCH hashing/hashing_benchmark.cpp) +ConfigureBench(HASHING_BENCH + hashing/hash_benchmark.cpp + hashing/partition_benchmark.cpp) ################################################################################################### # - merge benchmark ------------------------------------------------------------------------------- @@ -170,7 +189,9 @@ ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) ################################################################################################### # - binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench(BINARYOP_BENCH binaryop/binaryop_benchmark.cu) +ConfigureBench(BINARYOP_BENCH + binaryop/binaryop_benchmark.cpp + binaryop/jit_binaryop_benchmark.cpp) ################################################################################################### # - nvtext benchmark ------------------------------------------------------------------- diff --git a/cpp/benchmarks/binaryop/binaryop_benchmark.cu b/cpp/benchmarks/binaryop/binaryop_benchmark.cpp similarity index 100% rename from cpp/benchmarks/binaryop/binaryop_benchmark.cu rename to cpp/benchmarks/binaryop/binaryop_benchmark.cpp diff --git a/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp b/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp new file mode 100644 index 00000000000..29ca02a843d --- /dev/null +++ b/cpp/benchmarks/binaryop/jit_binaryop_benchmark.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + +#include + +template +class JIT_BINARYOP : public cudf::benchmark { +}; + +template +void BM_binaryop(benchmark::State& state, cudf::binary_operator binop) +{ + const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + + auto data_it = thrust::make_counting_iterator(0); + cudf::test::fixed_width_column_wrapper input1(data_it, data_it + column_size); + cudf::test::fixed_width_column_wrapper input2(data_it, data_it + column_size); + + auto lhs = cudf::column_view(input1); + auto rhs = cudf::column_view(input2); + auto output_dtype = cudf::data_type(cudf::type_to_id()); + + // Call once for hot cache. + cudf::binary_operation(lhs, rhs, binop, output_dtype); + + for (auto _ : state) { + cuda_event_timer timer(state, true); + cudf::binary_operation(lhs, rhs, binop, output_dtype); + } +} + +// TODO tparam boolean for null. 
+#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ + BENCHMARK_TEMPLATE_DEFINE_F(JIT_BINARYOP, binop, TypeLhs, TypeRhs, TypeOut) \ + (::benchmark::State & st) \ + { \ + BM_binaryop(st, cudf::binary_operator::binop); \ + } \ + BENCHMARK_REGISTER_F(JIT_BINARYOP, binop) \ + ->Unit(benchmark::kMillisecond) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ + ->Arg(100000000); /* 100M */ + +using namespace cudf; + +// clang-format off +BINARYOP_BENCHMARK_DEFINE(float, int64_t, ADD, int32_t); +BINARYOP_BENCHMARK_DEFINE(duration_s, duration_D, SUB, duration_ms); +BINARYOP_BENCHMARK_DEFINE(float, float, MUL, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, TRUE_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, FLOOR_DIV, int64_t); +BINARYOP_BENCHMARK_DEFINE(double, double, MOD, double); +BINARYOP_BENCHMARK_DEFINE(int64_t, int64_t, POW, double); +BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, BITWISE_AND, int16_t); +BINARYOP_BENCHMARK_DEFINE(int16_t, int32_t, BITWISE_OR, int64_t); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, BITWISE_XOR, int32_t); +BINARYOP_BENCHMARK_DEFINE(double, int8_t, LOGICAL_AND, int16_t); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, LOGICAL_OR, bool); +BINARYOP_BENCHMARK_DEFINE(timestamp_s, timestamp_s, LESS, bool); +BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s, GREATER, bool); +BINARYOP_BENCHMARK_DEFINE(int, int, SHIFT_LEFT, int); +BINARYOP_BENCHMARK_DEFINE(int16_t, int64_t, SHIFT_RIGHT, int); +BINARYOP_BENCHMARK_DEFINE(int64_t, int32_t, SHIFT_RIGHT_UNSIGNED, int64_t); +BINARYOP_BENCHMARK_DEFINE(int32_t, int64_t, PMOD, double); +BINARYOP_BENCHMARK_DEFINE(float, double, ATAN2, double); diff --git a/cpp/benchmarks/column/concatenate_benchmark.cpp b/cpp/benchmarks/column/concatenate_benchmark.cpp index b04cfba7d07..3634b2f08a2 100644 --- 
a/cpp/benchmarks/column/concatenate_benchmark.cpp +++ b/cpp/benchmarks/column/concatenate_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,7 +62,7 @@ static void BM_concatenate(benchmark::State& state) CHECK_CUDA(0); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); auto result = cudf::concatenate(column_views); } @@ -124,7 +124,7 @@ static void BM_concatenate_tables(benchmark::State& state) CHECK_CUDA(0); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); auto result = cudf::concatenate(table_views); } @@ -184,7 +184,7 @@ static void BM_concatenate_strings(benchmark::State& state) CHECK_CUDA(0); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); auto result = cudf::concatenate(column_views); } diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index a66416ad40b..591e42ceddf 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -18,6 +18,7 @@ #include "random_distribution_factory.hpp" #include +#include #include #include @@ -26,7 +27,7 @@ #include #include -#include +#include #include #include @@ -413,9 +414,9 @@ std::unique_ptr create_random_column(data_profi } } - rmm::device_vector d_chars(out_col.chars); - rmm::device_vector d_offsets(out_col.offsets); - rmm::device_vector d_null_mask(out_col.null_mask); + auto d_chars = cudf::detail::make_device_uvector_sync(out_col.chars); + auto d_offsets = cudf::detail::make_device_uvector_sync(out_col.offsets); + auto d_null_mask = 
cudf::detail::make_device_uvector_sync(out_col.null_mask); return cudf::make_strings_column(d_chars, d_offsets, d_null_mask); } diff --git a/cpp/benchmarks/filling/repeat_benchmark.cpp b/cpp/benchmarks/filling/repeat_benchmark.cpp new file mode 100644 index 00000000000..3cedd55767d --- /dev/null +++ b/cpp/benchmarks/filling/repeat_benchmark.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include + +#include + +#include + +#include "../fixture/benchmark_fixture.hpp" +#include "../synchronization/synchronization.hpp" + +class Repeat : public cudf::benchmark { +}; + +template +void BM_repeat(benchmark::State& state) +{ + using column_wrapper = cudf::test::fixed_width_column_wrapper; + auto const n_rows = static_cast(state.range(0)); + auto const n_cols = static_cast(state.range(1)); + + auto idx_begin = thrust::make_counting_iterator(0); + auto idx_end = thrust::make_counting_iterator(n_rows); + + std::vector columns; + columns.reserve(n_rows); + std::generate_n(std::back_inserter(columns), n_cols, [&]() { + return nulls ? 
column_wrapper( + idx_begin, + idx_end, + thrust::make_transform_iterator(idx_begin, [](auto idx) { return true; })) + : column_wrapper(idx_begin, idx_end); + }); + + // repeat counts + std::default_random_engine generator; + std::uniform_int_distribution distribution(0, 3); + + std::vector host_repeat_count(n_rows); + std::generate( + host_repeat_count.begin(), host_repeat_count.end(), [&] { return distribution(generator); }); + + cudf::test::fixed_width_column_wrapper repeat_count(host_repeat_count.begin(), + host_repeat_count.end()); + + // Create column views + auto const column_views = std::vector(columns.begin(), columns.end()); + + // Create table view + auto input = cudf::table_view(column_views); + + // warm up + auto output = cudf::repeat(input, repeat_count); + + for (auto _ : state) { + cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 + cudf::repeat(input, repeat_count); + } + + auto data_bytes = + (input.num_columns() * input.num_rows() + output->num_columns() * output->num_rows()) * + sizeof(TypeParam); + auto null_bytes = + nulls ? 
input.num_columns() * cudf::bitmask_allocation_size_bytes(input.num_rows()) + + output->num_columns() * cudf::bitmask_allocation_size_bytes(output->num_rows()) + : 0; + state.SetBytesProcessed(state.iterations() * (data_bytes + null_bytes)); +} + +#define REPEAT_BENCHMARK_DEFINE(name, type, nulls) \ + BENCHMARK_DEFINE_F(Repeat, name)(::benchmark::State & state) { BM_repeat(state); } \ + BENCHMARK_REGISTER_F(Repeat, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +REPEAT_BENCHMARK_DEFINE(double_nulls, double, true); +REPEAT_BENCHMARK_DEFINE(double_no_nulls, double, false); diff --git a/cpp/benchmarks/groupby/group_shift_benchmark.cu b/cpp/benchmarks/groupby/group_shift_benchmark.cu new file mode 100644 index 00000000000..81afcdd80e1 --- /dev/null +++ b/cpp/benchmarks/groupby/group_shift_benchmark.cu @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#include + +class Groupby : public cudf::benchmark { +}; + +// TODO: put it in a struct so `uniform` can be remade with different min, max +template +T random_int(T min, T max) +{ + static unsigned seed = 13377331; + static std::mt19937 engine{seed}; + static std::uniform_int_distribution uniform{min, max}; + + return uniform(engine); +} + +void BM_group_shift(benchmark::State& state) +{ + using wrapper = cudf::test::fixed_width_column_wrapper; + + const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + const int num_groups = 100; + + auto data_it = cudf::detail::make_counting_transform_iterator( + 0, [](cudf::size_type row) { return random_int(0, num_groups); }); + + wrapper keys(data_it, data_it + column_size); + wrapper vals(data_it, data_it + column_size); + + cudf::groupby::groupby gb_obj(cudf::table_view({keys})); + + std::vector offsets{ + static_cast(column_size / float(num_groups) * 0.5)}; // forward shift half way + // null fill value + auto fill_value = cudf::make_default_constructed_scalar(cudf::data_type(cudf::type_id::INT64)); + // non null fill value + // auto fill_value = cudf::make_fixed_width_scalar(static_cast(42)); + + for (auto _ : state) { + cuda_event_timer timer(state, true); + auto result = gb_obj.shift(cudf::table_view{{vals}}, offsets, {*fill_value}); + } +} + +BENCHMARK_DEFINE_F(Groupby, Shift)(::benchmark::State& state) { BM_group_shift(state); } + +BENCHMARK_REGISTER_F(Groupby, Shift) + ->Arg(1000000) + ->Arg(10000000) + ->Arg(100000000) + ->UseManualTime() + ->Unit(benchmark::kMillisecond); diff --git a/cpp/benchmarks/hashing/hash_benchmark.cpp b/cpp/benchmarks/hashing/hash_benchmark.cpp new file mode 100644 index 00000000000..77b10399693 --- /dev/null +++ b/cpp/benchmarks/hashing/hash_benchmark.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +class HashBenchmark : public cudf::benchmark { +}; + +static void BM_hash(benchmark::State& state, cudf::hash_id hid) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const data = create_random_table({cudf::type_id::INT64}, 1, row_count{n_rows}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + cudf::hash(data->view(), hid); + } +} + +#define HASH_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(HashBenchmark, name) \ + (::benchmark::State & st) { BM_hash(st, cudf::hash_id::name); } \ + BENCHMARK_REGISTER_F(HashBenchmark, name) \ + ->RangeMultiplier(4) \ + ->Ranges({{1 << 14, 1 << 24}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +HASH_BENCHMARK_DEFINE(HASH_MURMUR3) +HASH_BENCHMARK_DEFINE(HASH_MD5) +HASH_BENCHMARK_DEFINE(HASH_SERIAL_MURMUR3) +HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3) diff --git a/cpp/benchmarks/hashing/hashing_benchmark.cpp b/cpp/benchmarks/hashing/partition_benchmark.cpp similarity index 100% rename from cpp/benchmarks/hashing/hashing_benchmark.cpp rename to cpp/benchmarks/hashing/partition_benchmark.cpp diff --git a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp index d38747b934f..2f3f454fda6 100644 --- a/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp +++ 
b/cpp/benchmarks/io/orc/orc_reader_benchmark.cpp @@ -84,7 +84,6 @@ void BM_orc_read_varying_options(benchmark::State& state) auto const flags = state.range(state_idx++); auto const use_index = (flags & 1) != 0; auto const use_np_dtypes = (flags & 2) != 0; - auto const dec_as_float = (flags & 4) != 0; auto const ts_type = cudf::data_type{static_cast(state.range(state_idx++))}; auto const data_types = @@ -107,8 +106,7 @@ void BM_orc_read_varying_options(benchmark::State& state) .columns(cols_to_read) .use_index(use_index) .use_np_dtypes(use_np_dtypes) - .timestamp_type(ts_type) - .decimals_as_float64(dec_as_float); + .timestamp_type(ts_type); auto const num_stripes = data_size / (64 << 20); cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; @@ -167,7 +165,7 @@ BENCHMARK_REGISTER_F(OrcRead, column_selection) int32_t(column_selection::SECOND_HALF)}, {int32_t(row_selection::ALL)}, {1}, - {0b111}, // defaults + {0b11}, // defaults {int32_t(cudf::type_id::EMPTY)}}) ->Unit(benchmark::kMillisecond) ->UseManualTime(); @@ -178,7 +176,7 @@ BENCHMARK_REGISTER_F(OrcRead, row_selection) ->ArgsProduct({{int32_t(column_selection::ALL)}, {int32_t(row_selection::STRIPES), int32_t(row_selection::NROWS)}, {1, 8}, - {0b111}, // defaults + {0b11}, // defaults {int32_t(cudf::type_id::EMPTY)}}) ->Unit(benchmark::kMillisecond) ->UseManualTime(); @@ -189,7 +187,7 @@ BENCHMARK_REGISTER_F(OrcRead, misc_options) ->ArgsProduct({{int32_t(column_selection::ALL)}, {int32_t(row_selection::NROWS)}, {1}, - {0b111, 0b110, 0b101, 0b011}, // `true` is default for each boolean parameter here + {0b11, 0b10, 0b01}, // `true` is default for each boolean parameter here {int32_t(cudf::type_id::EMPTY), int32_t(cudf::type_id::TIMESTAMP_NANOSECONDS)}}) ->Unit(benchmark::kMillisecond) ->UseManualTime(); diff --git a/cpp/benchmarks/iterator/iterator_benchmark.cu b/cpp/benchmarks/iterator/iterator_benchmark.cu index 6c3255328cb..04307f5db25 100644 --- a/cpp/benchmarks/iterator/iterator_benchmark.cu 
+++ b/cpp/benchmarks/iterator/iterator_benchmark.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,19 +14,21 @@ * limitations under the License. */ -#include +#include "../fixture/benchmark_fixture.hpp" +#include "../synchronization/synchronization.hpp" +#include +#include +#include #include -#include -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" +#include -#include // include iterator header -// for reduction tests -#include #include -#include + +#include + +#include template T random_int(T min, T max) @@ -48,7 +50,7 @@ inline auto reduce_by_cub(OutputIterator result, InputIterator d_in, int num_ite nullptr, temp_storage_bytes, d_in, result, num_items, cudf::DeviceSum{}, init); // Allocate temporary storage - rmm::device_buffer d_temp_storage(temp_storage_bytes); + rmm::device_buffer d_temp_storage(temp_storage_bytes, rmm::cuda_stream_default); // Run reduction cub::DeviceReduce::Reduce( @@ -59,7 +61,7 @@ inline auto reduce_by_cub(OutputIterator result, InputIterator d_in, int num_ite // ----------------------------------------------------------------------------- template -void raw_stream_bench_cub(cudf::column_view &col, rmm::device_vector &result) +void raw_stream_bench_cub(cudf::column_view &col, rmm::device_uvector &result) { // std::cout << "raw stream cub: " << "\t"; @@ -71,7 +73,7 @@ void raw_stream_bench_cub(cudf::column_view &col, rmm::device_vector &result) }; template -void iterator_bench_cub(cudf::column_view &col, rmm::device_vector &result) +void iterator_bench_cub(cudf::column_view &col, rmm::device_uvector &result) { // std::cout << "iterator cub " << ( (has_null) ? 
": " : ": " ) << "\t"; @@ -89,7 +91,7 @@ void iterator_bench_cub(cudf::column_view &col, rmm::device_vector &result) // ----------------------------------------------------------------------------- template -void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_vector &result) +void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_uvector &result) { // std::cout << "raw stream thust: " << "\t\t"; @@ -100,7 +102,7 @@ void raw_stream_bench_thrust(cudf::column_view &col, rmm::device_vector &resu } template -void iterator_bench_thrust(cudf::column_view &col, rmm::device_vector &result) +void iterator_bench_thrust(cudf::column_view &col, rmm::device_uvector &result) { // std::cout << "iterator thust " << ( (has_null) ? ": " : ": " ) << "\t"; @@ -131,7 +133,8 @@ void BM_iterator(benchmark::State &state) cudf::test::fixed_width_column_wrapper wrap_hasnull_F(num_gen, num_gen + column_size); cudf::column_view hasnull_F = wrap_hasnull_F; - rmm::device_vector dev_result(1, T{0}); + // Initialize dev_result to false + auto dev_result = cudf::detail::make_zeroed_device_uvector_sync(1); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { @@ -163,7 +166,7 @@ __device__ thrust::pair operator+(thrust::pair lhs, thrust::pa // ----------------------------------------------------------------------------- template void pair_iterator_bench_cub(cudf::column_view &col, - rmm::device_vector> &result) + rmm::device_uvector> &result) { thrust::pair init{0, false}; auto d_col = cudf::column_device_view::create(col); @@ -174,7 +177,7 @@ void pair_iterator_bench_cub(cudf::column_view &col, template void pair_iterator_bench_thrust(cudf::column_view &col, - rmm::device_vector> &result) + rmm::device_uvector> &result) { thrust::pair init{0, false}; auto d_col = cudf::column_device_view::create(col); @@ -198,7 +201,8 @@ void BM_pair_iterator(benchmark::State &state) cudf::column_view hasnull_F = wrap_hasnull_F; 
cudf::column_view hasnull_T = wrap_hasnull_T; - rmm::device_vector> dev_result(1, {T{0}, false}); + // Initialize dev_result to false + auto dev_result = cudf::detail::make_zeroed_device_uvector_sync>(1); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index 79cb2d3e44d..285a9241a26 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,22 @@ * limitations under the License. */ -#ifndef __GENERATE_INPUT_TABLES_CUH -#define __GENERATE_INPUT_TABLES_CUH +#pragma once + +#include +#include + +#include +#include -#include -#include #include #include #include -#include -#include -#include +#include +#include + +#include __global__ static void init_curand(curandState* state, const int nstates) { @@ -188,64 +192,63 @@ void generate_input_tables(key_type* const build_tbl, const int num_states = num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size; - rmm::device_vector devStates(num_states); + rmm::device_uvector devStates(num_states, rmm::cuda_stream_default); - init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data().get(), - num_states); + init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data(), num_states); CHECK_CUDA(0); - rmm::device_vector build_tbl_sorted(build_tbl_size); - size_type lottery_size = rand_max < std::numeric_limits::max() - 1 ? 
rand_max + 1 : rand_max; - rmm::device_vector lottery(lottery_size); + rmm::device_uvector lottery(lottery_size, rmm::cuda_stream_default); - if (uniq_build_tbl_keys) { thrust::sequence(thrust::device, lottery.begin(), lottery.end(), 0); } + if (uniq_build_tbl_keys) { + thrust::sequence(rmm::exec_policy(), lottery.begin(), lottery.end(), 0); + } init_build_tbl <<>>(build_tbl, build_tbl_size, rand_max, uniq_build_tbl_keys, - lottery.data().get(), + lottery.data(), lottery_size, - devStates.data().get(), + devStates.data(), num_states); CHECK_CUDA(0); - CUDA_TRY(cudaMemcpy(build_tbl_sorted.data().get(), + rmm::device_uvector build_tbl_sorted(build_tbl_size, rmm::cuda_stream_default); + + CUDA_TRY(cudaMemcpy(build_tbl_sorted.data(), build_tbl, build_tbl_size * sizeof(key_type), cudaMemcpyDeviceToDevice)); - thrust::sort(thrust::device, build_tbl_sorted.begin(), build_tbl_sorted.end()); + thrust::sort(rmm::exec_policy(), build_tbl_sorted.begin(), build_tbl_sorted.end()); // Exclude keys used in build table from lottery thrust::counting_iterator first_lottery_elem(0); thrust::counting_iterator last_lottery_elem = first_lottery_elem + lottery_size; - key_type* lottery_end = thrust::set_difference(thrust::device, + key_type* lottery_end = thrust::set_difference(rmm::exec_policy(), first_lottery_elem, last_lottery_elem, build_tbl_sorted.begin(), build_tbl_sorted.end(), - lottery.data().get()); + lottery.data()); - lottery_size = thrust::distance(lottery.data().get(), lottery_end); + lottery_size = thrust::distance(lottery.data(), lottery_end); init_probe_tbl <<>>(probe_tbl, probe_tbl_size, build_tbl, build_tbl_size, - lottery.data().get(), + lottery.data(), lottery_size, selectivity, - devStates.data().get(), + devStates.data(), num_states); CHECK_CUDA(0); } - -#endif // __GENERATE_INPUT_TABLES_CUH diff --git a/cpp/benchmarks/join/join_benchmark.cu b/cpp/benchmarks/join/join_benchmark.cu index fa6afdd908c..a7c109db9b4 100644 --- a/cpp/benchmarks/join/join_benchmark.cu +++ 
b/cpp/benchmarks/join/join_benchmark.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,8 +37,8 @@ template class Join : public cudf::benchmark { }; -template -static void BM_join(benchmark::State &state) +template +static void BM_join(benchmark::State& state, Join JoinFunc) { const cudf::size_type build_table_size{(cudf::size_type)state.range(0)}; const cudf::size_type probe_table_size{(cudf::size_type)state.range(1)}; @@ -103,22 +103,71 @@ static void BM_join(benchmark::State &state) // Benchmark the inner join operation for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto result = cudf::inner_join( + auto result = JoinFunc( probe_table, build_table, columns_to_join, columns_to_join, cudf::null_equality::UNEQUAL); } } -#define JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ - (::benchmark::State & st) { BM_join(st); } +#define JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + std::vector const& left_on, \ + std::vector const& right_on, \ + cudf::null_equality compare_nulls) { \ + return cudf::inner_join(left, right, left_on, right_on, compare_nulls); \ + }; \ + BM_join(st, join); \ + } JOIN_BENCHMARK_DEFINE(join_32bit, int32_t, int32_t, false); JOIN_BENCHMARK_DEFINE(join_64bit, int64_t, int64_t, false); JOIN_BENCHMARK_DEFINE(join_32bit_nulls, int32_t, int32_t, true); JOIN_BENCHMARK_DEFINE(join_64bit_nulls, int64_t, int64_t, true); +#define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, key_type, 
payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + std::vector const& left_on, \ + std::vector const& right_on, \ + cudf::null_equality compare_nulls) { \ + return cudf::left_anti_join(left, right, left_on, right_on, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit, int32_t, int32_t, false); +LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit, int64_t, int64_t, false); +LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit_nulls, int32_t, int32_t, true); +LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit_nulls, int64_t, int64_t, true); + +#define LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + std::vector const& left_on, \ + std::vector const& right_on, \ + cudf::null_equality compare_nulls) { \ + return cudf::left_semi_join(left, right, left_on, right_on, compare_nulls); \ + }; \ + BM_join(st, join); \ + } + +LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit, int32_t, int32_t, false); +LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit, int64_t, int64_t, false); +LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit_nulls, int32_t, int32_t, true); +LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit_nulls, int64_t, int64_t, true); + +// join ----------------------------------------------------------------------- BENCHMARK_REGISTER_F(Join, join_32bit) ->Unit(benchmark::kMillisecond) ->Args({100'000, 100'000}) @@ -154,3 +203,77 @@ BENCHMARK_REGISTER_F(Join, join_64bit_nulls) ->Args({50'000'000, 50'000'000}) ->Args({40'000'000, 120'000'000}) ->UseManualTime(); + +// left anti-join 
------------------------------------------------------------- +BENCHMARK_REGISTER_F(Join, left_anti_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->Args({10'000'000, 10'000'000}) + ->Args({10'000'000, 40'000'000}) + ->Args({10'000'000, 100'000'000}) + ->Args({100'000'000, 100'000'000}) + ->Args({80'000'000, 240'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, left_anti_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({50'000'000, 50'000'000}) + ->Args({40'000'000, 120'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, left_anti_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->Args({10'000'000, 10'000'000}) + ->Args({10'000'000, 40'000'000}) + ->Args({10'000'000, 100'000'000}) + ->Args({100'000'000, 100'000'000}) + ->Args({80'000'000, 240'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, left_anti_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({50'000'000, 50'000'000}) + ->Args({40'000'000, 120'000'000}) + ->UseManualTime(); + +// left semi-join ------------------------------------------------------------- +BENCHMARK_REGISTER_F(Join, left_semi_join_32bit) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + ->Args({10'000'000, 10'000'000}) + ->Args({10'000'000, 40'000'000}) + ->Args({10'000'000, 100'000'000}) + ->Args({100'000'000, 100'000'000}) + ->Args({80'000'000, 240'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, left_semi_join_64bit) + ->Unit(benchmark::kMillisecond) + ->Args({50'000'000, 50'000'000}) + ->Args({40'000'000, 120'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, left_semi_join_32bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({100'000, 100'000}) + ->Args({100'000, 400'000}) + ->Args({100'000, 1'000'000}) + 
->Args({10'000'000, 10'000'000}) + ->Args({10'000'000, 40'000'000}) + ->Args({10'000'000, 100'000'000}) + ->Args({100'000'000, 100'000'000}) + ->Args({80'000'000, 240'000'000}) + ->UseManualTime(); + +BENCHMARK_REGISTER_F(Join, left_semi_join_64bit_nulls) + ->Unit(benchmark::kMillisecond) + ->Args({50'000'000, 50'000'000}) + ->Args({40'000'000, 120'000'000}) + ->UseManualTime(); diff --git a/cpp/benchmarks/merge/merge_benchmark.cpp b/cpp/benchmarks/merge/merge_benchmark.cpp index 13eb284a903..1af0fcbb237 100644 --- a/cpp/benchmarks/merge/merge_benchmark.cpp +++ b/cpp/benchmarks/merge/merge_benchmark.cpp @@ -88,7 +88,7 @@ void BM_merge(benchmark::State& state) #define MBM_BENCHMARK_DEFINE(name) \ BENCHMARK_DEFINE_F(Merge, name)(::benchmark::State & state) { BM_merge(state); } \ BENCHMARK_REGISTER_F(Merge, name) \ - ->Unit(benchmark::kNanosecond) \ + ->Unit(benchmark::kMillisecond) \ ->UseManualTime() \ ->RangeMultiplier(2) \ ->Ranges({{2, 128}}); diff --git a/cpp/benchmarks/quantiles/quantiles_benchmark.cpp b/cpp/benchmarks/quantiles/quantiles_benchmark.cpp new file mode 100644 index 00000000000..fa602304dec --- /dev/null +++ b/cpp/benchmarks/quantiles/quantiles_benchmark.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +class Quantiles : public cudf::benchmark { +}; + +static void BM_quantiles(benchmark::State& state, bool nulls) +{ + using Type = int; + using column_wrapper = cudf::test::fixed_width_column_wrapper; + std::default_random_engine generator; + std::uniform_int_distribution distribution(0, 100); + + const cudf::size_type n_rows{(cudf::size_type)state.range(0)}; + const cudf::size_type n_cols{(cudf::size_type)state.range(1)}; + const cudf::size_type n_quantiles{(cudf::size_type)state.range(2)}; + + // Create columns with values in the range [0,100) + std::vector columns; + columns.reserve(n_cols); + std::generate_n(std::back_inserter(columns), n_cols, [&, n_rows]() { + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [&](auto row) { return distribution(generator); }); + if (!nulls) return column_wrapper(elements, elements + n_rows); + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 100 == 0 ? false : true; }); + return column_wrapper(elements, elements + n_rows, valids); + }); + + // Create column views + auto column_views = std::vector(columns.begin(), columns.end()); + + // Create table view + auto input = cudf::table_view(column_views); + + std::vector q(n_quantiles); + thrust::tabulate( + thrust::seq, q.begin(), q.end(), [n_quantiles](auto i) { return i * (1.0f / n_quantiles); }); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto result = cudf::quantiles(input, q); + // auto result = (stable) ? 
cudf::stable_sorted_order(input) : cudf::sorted_order(input); + } +} + +#define QUANTILES_BENCHMARK_DEFINE(name, nulls) \ + BENCHMARK_DEFINE_F(Quantiles, name) \ + (::benchmark::State & st) { BM_quantiles(st, nulls); } \ + BENCHMARK_REGISTER_F(Quantiles, name) \ + ->RangeMultiplier(4) \ + ->Ranges({{1 << 16, 1 << 26}, {1, 8}, {1, 12}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +QUANTILES_BENCHMARK_DEFINE(no_nulls, false) +QUANTILES_BENCHMARK_DEFINE(nulls, true) diff --git a/cpp/benchmarks/replace/clamp_benchmark.cpp b/cpp/benchmarks/replace/clamp_benchmark.cpp new file mode 100644 index 00000000000..f897b9d82cc --- /dev/null +++ b/cpp/benchmarks/replace/clamp_benchmark.cpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +class ReplaceClamp : public cudf::benchmark { +}; + +template +static void BM_reduction_scan(benchmark::State& state, bool include_nulls) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const dtype = cudf::type_to_id(); + auto const table = create_random_table({dtype}, 1, row_count{n_rows}); + if (!include_nulls) { table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); } + cudf::column_view input(table->view().column(0)); + + auto [low_scalar, high_scalar] = cudf::minmax(input); + + // set the clamps 2 in from the min and max + { + using ScalarType = cudf::scalar_type_t; + auto lvalue = static_cast(low_scalar.get()); + auto hvalue = static_cast(high_scalar.get()); + + // super heavy clamp + auto mid = lvalue->value() + (hvalue->value() - lvalue->value()) / 2; + lvalue->set_value(mid - 10); + hvalue->set_value(mid + 10); + } + + for (auto _ : state) { + cuda_event_timer timer(state, true); + auto result = cudf::clamp(input, *low_scalar, *high_scalar); + } +} + +#define CLAMP_BENCHMARK_DEFINE(name, type, nulls) \ + BENCHMARK_DEFINE_F(ReplaceClamp, name) \ + (::benchmark::State & state) { BM_reduction_scan(state, nulls); } \ + BENCHMARK_REGISTER_F(ReplaceClamp, name) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ + ->Arg(100000000); /* 100M */ + +CLAMP_BENCHMARK_DEFINE(int8_no_nulls, int8_t, false); +CLAMP_BENCHMARK_DEFINE(int32_no_nulls, int32_t, false); +CLAMP_BENCHMARK_DEFINE(uint64_no_nulls, uint64_t, false); +CLAMP_BENCHMARK_DEFINE(float_no_nulls, float, false); +CLAMP_BENCHMARK_DEFINE(int16_nulls, int16_t, true); +CLAMP_BENCHMARK_DEFINE(uint32_nulls, uint32_t, true); +CLAMP_BENCHMARK_DEFINE(double_nulls, double, true); diff --git a/cpp/benchmarks/search/search_benchmark.cu b/cpp/benchmarks/search/search_benchmark.cpp similarity 
index 63% rename from cpp/benchmarks/search/search_benchmark.cu rename to cpp/benchmarks/search/search_benchmark.cpp index 7b4b8060514..7fb196fb500 100644 --- a/cpp/benchmarks/search/search_benchmark.cu +++ b/cpp/benchmarks/search/search_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,35 +31,6 @@ class Search : public cudf::benchmark { }; -void BM_non_null_column(benchmark::State& state) -{ - const cudf::size_type column_size{(cudf::size_type)state.range(0)}; - const cudf::size_type values_size = column_size; - - auto col_data_it = cudf::detail::make_counting_transform_iterator( - 0, [=](cudf::size_type row) { return static_cast(row); }); - auto val_data_it = cudf::detail::make_counting_transform_iterator( - 0, [=](cudf::size_type row) { return static_cast(values_size - row); }); - - cudf::test::fixed_width_column_wrapper column(col_data_it, col_data_it + column_size); - cudf::test::fixed_width_column_wrapper values(val_data_it, val_data_it + values_size); - - for (auto _ : state) { - cuda_event_timer timer(state, true); - auto col = cudf::upper_bound(cudf::table_view({column}), - cudf::table_view({values}), - {cudf::order::ASCENDING}, - {cudf::null_order::BEFORE}); - } -} - -BENCHMARK_DEFINE_F(Search, AllValidColumn)(::benchmark::State& state) { BM_non_null_column(state); } - -BENCHMARK_REGISTER_F(Search, AllValidColumn) - ->UseManualTime() - ->Unit(benchmark::kMillisecond) - ->Arg(100000000); - auto make_validity_iter() { static constexpr int r_min = 1; @@ -71,7 +42,7 @@ auto make_validity_iter() 0, [mod_base](auto row) { return (row % mod_base) > 0; }); } -void BM_nullable_column(benchmark::State& state) +void BM_column(benchmark::State& state, bool nulls) { const cudf::size_type column_size{(cudf::size_type)state.range(0)}; const cudf::size_type 
values_size = column_size; @@ -81,25 +52,39 @@ void BM_nullable_column(benchmark::State& state) auto val_data_it = cudf::detail::make_counting_transform_iterator( 0, [=](cudf::size_type row) { return static_cast(values_size - row); }); - cudf::test::fixed_width_column_wrapper column( - col_data_it, col_data_it + column_size, make_validity_iter()); - cudf::test::fixed_width_column_wrapper values( - val_data_it, val_data_it + values_size, make_validity_iter()); - - auto sorted = cudf::sort(cudf::table_view({column})); + auto column = [&]() { + return nulls ? cudf::test::fixed_width_column_wrapper( + col_data_it, col_data_it + column_size, make_validity_iter()) + : cudf::test::fixed_width_column_wrapper(col_data_it, + col_data_it + column_size); + }(); + auto values = [&]() { + return nulls ? cudf::test::fixed_width_column_wrapper( + val_data_it, val_data_it + values_size, make_validity_iter()) + : cudf::test::fixed_width_column_wrapper(val_data_it, + val_data_it + values_size); + }(); + + auto data_table = cudf::sort(cudf::table_view({column})); for (auto _ : state) { cuda_event_timer timer(state, true); - auto col = cudf::upper_bound(sorted->view(), + auto col = cudf::upper_bound(data_table->view(), cudf::table_view({values}), {cudf::order::ASCENDING}, {cudf::null_order::BEFORE}); } } -BENCHMARK_DEFINE_F(Search, NullableColumn)(::benchmark::State& state) { BM_nullable_column(state); } +BENCHMARK_DEFINE_F(Search, Column_AllValid)(::benchmark::State& state) { BM_column(state, false); } +BENCHMARK_DEFINE_F(Search, Column_Nulls)(::benchmark::State& state) { BM_column(state, true); } + +BENCHMARK_REGISTER_F(Search, Column_AllValid) + ->UseManualTime() + ->Unit(benchmark::kMillisecond) + ->Arg(100000000); -BENCHMARK_REGISTER_F(Search, NullableColumn) +BENCHMARK_REGISTER_F(Search, Column_Nulls) ->UseManualTime() ->Unit(benchmark::kMillisecond) ->Arg(100000000); @@ -153,3 +138,53 @@ BENCHMARK_REGISTER_F(Search, Table) ->UseManualTime() ->Unit(benchmark::kMillisecond) 
->Apply(CustomArguments); + +void BM_contains(benchmark::State& state, bool nulls) +{ + const cudf::size_type column_size{(cudf::size_type)state.range(0)}; + const cudf::size_type values_size = column_size; + + auto col_data_it = cudf::detail::make_counting_transform_iterator( + 0, [=](cudf::size_type row) { return static_cast(row); }); + auto val_data_it = cudf::detail::make_counting_transform_iterator( + 0, [=](cudf::size_type row) { return static_cast(values_size - row); }); + + auto column = [&]() { + return nulls ? cudf::test::fixed_width_column_wrapper( + col_data_it, col_data_it + column_size, make_validity_iter()) + : cudf::test::fixed_width_column_wrapper(col_data_it, + col_data_it + column_size); + }(); + auto values = [&]() { + return nulls ? cudf::test::fixed_width_column_wrapper( + val_data_it, val_data_it + values_size, make_validity_iter()) + : cudf::test::fixed_width_column_wrapper(val_data_it, + val_data_it + values_size); + }(); + + for (auto _ : state) { + cuda_event_timer timer(state, true); + auto col = cudf::contains(column, values); + } +} + +BENCHMARK_DEFINE_F(Search, ColumnContains_AllValid)(::benchmark::State& state) +{ + BM_contains(state, false); +} +BENCHMARK_DEFINE_F(Search, ColumnContains_Nulls)(::benchmark::State& state) +{ + BM_contains(state, true); +} + +BENCHMARK_REGISTER_F(Search, ColumnContains_AllValid) + ->RangeMultiplier(8) + ->Ranges({{1 << 10, 1 << 26}}) + ->UseManualTime() + ->Unit(benchmark::kMillisecond); + +BENCHMARK_REGISTER_F(Search, ColumnContains_Nulls) + ->RangeMultiplier(8) + ->Ranges({{1 << 10, 1 << 26}}) + ->UseManualTime() + ->Unit(benchmark::kMillisecond); diff --git a/cpp/benchmarks/sort/rank_benchmark.cpp b/cpp/benchmarks/sort/rank_benchmark.cpp new file mode 100644 index 00000000000..60be95b9112 --- /dev/null +++ b/cpp/benchmarks/sort/rank_benchmark.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +class Rank : public cudf::benchmark { +}; + +static void BM_rank(benchmark::State& state, bool nulls) +{ + using Type = int; + using column_wrapper = cudf::test::fixed_width_column_wrapper; + std::default_random_engine generator; + std::uniform_int_distribution distribution(0, 100); + + const cudf::size_type n_rows{(cudf::size_type)state.range(0)}; + + // Create columns with values in the range [0,100) + column_wrapper input = [&, n_rows]() { + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [&](auto row) { return distribution(generator); }); + if (!nulls) return column_wrapper(elements, elements + n_rows); + auto valids = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i % 100 == 0 ? false : true; }); + return column_wrapper(elements, elements + n_rows, valids); + }(); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto result = cudf::rank(input, + cudf::rank_method::FIRST, + cudf::order::ASCENDING, + nulls ? 
cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE, + cudf::null_order::AFTER, + false); + } +} + +#define RANK_BENCHMARK_DEFINE(name, nulls) \ + BENCHMARK_DEFINE_F(Rank, name) \ + (::benchmark::State & st) { BM_rank(st, nulls); } \ + BENCHMARK_REGISTER_F(Rank, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 10, 1 << 26}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +RANK_BENCHMARK_DEFINE(no_nulls, false) +RANK_BENCHMARK_DEFINE(nulls, true) diff --git a/cpp/benchmarks/sort/sort_benchmark.cpp b/cpp/benchmarks/sort/sort_benchmark.cpp index fb74469e7c0..fe68ddd0051 100644 --- a/cpp/benchmarks/sort/sort_benchmark.cpp +++ b/cpp/benchmarks/sort/sort_benchmark.cpp @@ -61,7 +61,7 @@ static void BM_sort(benchmark::State& state, bool nulls) auto input = cudf::table_view(column_views); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); auto result = (stable) ? cudf::stable_sorted_order(input) : cudf::sorted_order(input); } diff --git a/cpp/benchmarks/sort/sort_strings_benchmark.cpp b/cpp/benchmarks/sort/sort_strings_benchmark.cpp index 54e85b7ea8c..f5effcafcfb 100644 --- a/cpp/benchmarks/sort/sort_strings_benchmark.cpp +++ b/cpp/benchmarks/sort/sort_strings_benchmark.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,7 +32,7 @@ static void BM_sort(benchmark::State& state) auto const table = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); cudf::sort(table->view()); } } diff --git a/cpp/benchmarks/string/case_benchmark.cpp b/cpp/benchmarks/string/case_benchmark.cpp index 9c1c81da22a..508ae49e093 100644 --- a/cpp/benchmarks/string/case_benchmark.cpp +++ b/cpp/benchmarks/string/case_benchmark.cpp @@ -32,7 +32,7 @@ static void BM_case(benchmark::State& state) cudf::strings_column_view input(table->view().column(0)); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); cudf::strings::to_lower(input); } diff --git a/cpp/benchmarks/string/combine_benchmark.cpp b/cpp/benchmarks/string/combine_benchmark.cpp index 2a5013a9ae7..7dabd32e874 100644 --- a/cpp/benchmarks/string/combine_benchmark.cpp +++ b/cpp/benchmarks/string/combine_benchmark.cpp @@ -43,7 +43,7 @@ static void BM_combine(benchmark::State& state) cudf::string_scalar separator("+"); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); cudf::strings::concatenate(table->view(), separator); } diff --git a/cpp/benchmarks/string/contains_benchmark.cpp b/cpp/benchmarks/string/contains_benchmark.cpp index 1a2ac8ad602..79bdda77634 100644 --- a/cpp/benchmarks/string/contains_benchmark.cpp +++ b/cpp/benchmarks/string/contains_benchmark.cpp @@ -35,7 +35,7 @@ static void BM_contains(benchmark::State& state, contains_type ct) cudf::strings_column_view input(table->view().column(0)); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); // contains_re(), matches_re(), and count_re() all have similar functions // with count_re() being the most regex intensive switch (ct) 
{ diff --git a/cpp/benchmarks/string/copy_benchmark.cpp b/cpp/benchmarks/string/copy_benchmark.cpp index af9f5b4fa4a..b49bc878ca7 100644 --- a/cpp/benchmarks/string/copy_benchmark.cpp +++ b/cpp/benchmarks/string/copy_benchmark.cpp @@ -54,7 +54,7 @@ static void BM_copy(benchmark::State& state, copy_type ct) host_map_data.end()); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); switch (ct) { case gather: cudf::gather(source->view(), index_map); break; case scatter: cudf::scatter(source->view(), index_map, target->view()); break; diff --git a/cpp/benchmarks/string/extract_benchmark.cpp b/cpp/benchmarks/string/extract_benchmark.cpp index dbae18dde3b..aa1e59a22bf 100644 --- a/cpp/benchmarks/string/extract_benchmark.cpp +++ b/cpp/benchmarks/string/extract_benchmark.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "string_bench_args.hpp" + #include #include #include @@ -23,43 +25,55 @@ #include #include -#include "string_bench_args.hpp" +#include class StringExtract : public cudf::benchmark { }; -static void BM_extract(benchmark::State& state, int re_instructions) +static void BM_extract(benchmark::State& state, int groups) { - cudf::size_type const n_rows{static_cast(state.range(0))}; - cudf::size_type const max_str_length{static_cast(state.range(1))}; - data_profile table_profile; - table_profile.set_distribution_params( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const table = - create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); - cudf::strings_column_view input(table->view().column(0)); - std::string const raw_pattern = - "1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234" - "5678901234567890123456789012345678901234567890"; - std::string const pattern = "(" + raw_pattern.substr(0, re_instructions) + ")"; + auto const n_rows = static_cast(state.range(0)); + auto 
const n_length = static_cast(state.range(1)); + + std::default_random_engine generator; + std::uniform_int_distribution words_dist(0, 999); + + std::vector samples(100); // 100 unique rows of data to reuse + std::generate(samples.begin(), samples.end(), [&]() { + std::string row; // build a row of random tokens + while (static_cast(row.size()) < n_length) { + row += std::to_string(words_dist(generator)) + " "; + } + return row; + }); + + std::string pattern; + while (static_cast(pattern.size()) < groups) { pattern += "(\\d+) "; } + + std::uniform_int_distribution distribution(0, samples.size() - 1); + auto elements = cudf::detail::make_counting_transform_iterator( + 0, [&](auto idx) { return samples.at(distribution(generator)); }); + cudf::test::strings_column_wrapper input(elements, elements + n_rows); + cudf::strings_column_view view(input); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); - auto results = cudf::strings::extract(input, pattern); + cuda_event_timer raii(state, true); + auto results = cudf::strings::extract(view, pattern); } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * view.chars_size()); } static void generate_bench_args(benchmark::internal::Benchmark* b) { - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_multiplier = 8; + int const min_row_length = 1 << 5; + int const max_row_length = 1 << 13; + int const length_multiplier = 4; + generate_string_bench_args( + b, min_rows, max_rows, row_multiplier, min_row_length, max_row_length, length_multiplier); } #define STRINGS_BENCHMARK_DEFINE(name, instructions) \ @@ -70,6 +84,6 @@ static void 
generate_bench_args(benchmark::internal::Benchmark* b) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -STRINGS_BENCHMARK_DEFINE(small, 4) -STRINGS_BENCHMARK_DEFINE(medium, 48) -STRINGS_BENCHMARK_DEFINE(large, 128) +STRINGS_BENCHMARK_DEFINE(small, 2) +STRINGS_BENCHMARK_DEFINE(medium, 10) +STRINGS_BENCHMARK_DEFINE(large, 30) diff --git a/cpp/benchmarks/string/factory_benchmark.cu b/cpp/benchmarks/string/factory_benchmark.cu index 6c5dceffaa8..bae08431b51 100644 --- a/cpp/benchmarks/string/factory_benchmark.cu +++ b/cpp/benchmarks/string/factory_benchmark.cu @@ -25,6 +25,7 @@ #include #include +#include #include #include @@ -55,7 +56,7 @@ static void BM_factory(benchmark::State& state) auto const table = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); auto d_column = cudf::column_device_view::create(table->view().column(0)); - rmm::device_vector pairs(d_column->size()); + rmm::device_uvector pairs(d_column->size(), rmm::cuda_stream_default); thrust::transform(thrust::device, d_column->pair_begin(), d_column->pair_end(), @@ -63,7 +64,7 @@ static void BM_factory(benchmark::State& state) string_view_to_pair{}); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); cudf::make_strings_column(pairs); } diff --git a/cpp/benchmarks/string/filter_benchmark.cpp b/cpp/benchmarks/string/filter_benchmark.cpp index 123c5597df9..97228122c42 100644 --- a/cpp/benchmarks/string/filter_benchmark.cpp +++ b/cpp/benchmarks/string/filter_benchmark.cpp @@ -50,7 +50,7 @@ static void BM_filter_chars(benchmark::State& state, FilterAPI api) {cudf::char_utf8{'a'}, cudf::char_utf8{'c'}}}; for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); switch (api) { case filter: cudf::strings::filter_characters_of_type(input, types); break; case filter_chars: cudf::strings::filter_characters(input, filter_table); break; 
@@ -73,7 +73,7 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { // avoid generating combinations that exceed the cudf column limit size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < std::numeric_limits::max()) { + if (total_chars < static_cast(std::numeric_limits::max())) { b->Args({row_count, rowlen}); } } diff --git a/cpp/benchmarks/string/find_benchmark.cpp b/cpp/benchmarks/string/find_benchmark.cpp index 200527d606e..8e570a55440 100644 --- a/cpp/benchmarks/string/find_benchmark.cpp +++ b/cpp/benchmarks/string/find_benchmark.cpp @@ -46,7 +46,7 @@ static void BM_find_scalar(benchmark::State& state, FindAPI find_api) cudf::test::strings_column_wrapper targets({"+", "-"}); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); switch (find_api) { case find: cudf::strings::find(input, target); break; case find_multi: @@ -73,7 +73,7 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { // avoid generating combinations that exceed the cudf column limit size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < std::numeric_limits::max()) { + if (total_chars < static_cast(std::numeric_limits::max())) { b->Args({row_count, rowlen}); } } diff --git a/cpp/benchmarks/string/json_benchmark.cpp b/cpp/benchmarks/string/json_benchmark.cpp index 6fb6a07a8d0..c6a6b757951 100644 --- a/cpp/benchmarks/string/json_benchmark.cpp +++ b/cpp/benchmarks/string/json_benchmark.cpp @@ -113,7 +113,7 @@ static void BM_case(benchmark::State& state, QueryArg&&... 
query_arg) std::string json_path(query_arg...); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true); auto result = cudf::strings::get_json_object(scv, json_path); cudaStreamSynchronize(0); } diff --git a/cpp/benchmarks/string/replace_benchmark.cpp b/cpp/benchmarks/string/replace_benchmark.cpp index 968b8f5abb0..0d785fd25aa 100644 --- a/cpp/benchmarks/string/replace_benchmark.cpp +++ b/cpp/benchmarks/string/replace_benchmark.cpp @@ -49,7 +49,7 @@ static void BM_replace(benchmark::State& state, replace_type rt) cudf::test::strings_column_wrapper repls({"", ""}); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); switch (rt) { case scalar: cudf::strings::replace(input, target, repl); break; case slice: cudf::strings::replace_slice(input, repl, 1, 10); break; diff --git a/cpp/benchmarks/string/replace_re_benchmark.cpp b/cpp/benchmarks/string/replace_re_benchmark.cpp index 616e2c0f22c..18ec28371e3 100644 --- a/cpp/benchmarks/string/replace_re_benchmark.cpp +++ b/cpp/benchmarks/string/replace_re_benchmark.cpp @@ -43,7 +43,7 @@ static void BM_replace(benchmark::State& state, replace_type rt) cudf::test::strings_column_wrapper repls({"#", ""}); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); switch (rt) { case replace_type::replace_re: // contains_re and matches_re use the same main logic cudf::strings::replace_re(input, "\\d+"); diff --git a/cpp/benchmarks/string/split_benchmark.cpp b/cpp/benchmarks/string/split_benchmark.cpp index 35bedb1b767..cab477754a6 100644 --- a/cpp/benchmarks/string/split_benchmark.cpp +++ b/cpp/benchmarks/string/split_benchmark.cpp @@ -44,7 +44,7 @@ static void BM_split(benchmark::State& state, split_type rt) cudf::string_scalar target("+"); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, 
rmm::cuda_stream_default); switch (rt) { case split: cudf::strings::split(input, target); break; case split_ws: cudf::strings::split(input); break; @@ -68,7 +68,7 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { // avoid generating combinations that exceed the cudf column limit size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < std::numeric_limits::max()) { + if (total_chars < static_cast(std::numeric_limits::max())) { b->Args({row_count, rowlen}); } } diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp index 05ed1bf5b33..92a46374438 100644 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ b/cpp/benchmarks/string/string_bench_args.hpp @@ -48,7 +48,7 @@ inline void generate_string_bench_args(benchmark::internal::Benchmark* b, for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= rowlen_mult) { // avoid generating combinations that exceed the cudf column limit size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < std::numeric_limits::max()) { + if (total_chars < static_cast(std::numeric_limits::max())) { b->Args({row_count, rowlen}); } } diff --git a/cpp/benchmarks/string/substring_benchmark.cpp b/cpp/benchmarks/string/substring_benchmark.cpp index d47c42e45be..e8a66f7b323 100644 --- a/cpp/benchmarks/string/substring_benchmark.cpp +++ b/cpp/benchmarks/string/substring_benchmark.cpp @@ -54,7 +54,7 @@ static void BM_substring(benchmark::State& state, substring_type rt) cudf::test::strings_column_wrapper delimiters(delim_itr, delim_itr + n_rows); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); switch (rt) { case position: cudf::strings::slice_strings(input, 1, max_str_length / 2); break; case multi_position: cudf::strings::slice_strings(input, starts, stops); break; diff --git 
a/cpp/benchmarks/string/translate_benchmark.cpp b/cpp/benchmarks/string/translate_benchmark.cpp index c49a986d744..49396b0ce71 100644 --- a/cpp/benchmarks/string/translate_benchmark.cpp +++ b/cpp/benchmarks/string/translate_benchmark.cpp @@ -54,7 +54,7 @@ static void BM_translate(benchmark::State& state, int entry_count) }); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); cudf::strings::translate(input, entries); } diff --git a/cpp/benchmarks/string/url_decode_benchmark.cpp b/cpp/benchmarks/string/url_decode_benchmark.cpp index 26c23ea23b4..fbb99bf3e8f 100644 --- a/cpp/benchmarks/string/url_decode_benchmark.cpp +++ b/cpp/benchmarks/string/url_decode_benchmark.cpp @@ -80,7 +80,7 @@ void BM_url_decode(benchmark::State& state) auto strings_view = cudf::strings_column_view(column); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); auto result = cudf::strings::url_decode(strings_view); } diff --git a/cpp/benchmarks/text/ngrams_benchmark.cpp b/cpp/benchmarks/text/ngrams_benchmark.cpp index 1fe8e3b7f2e..52f55249631 100644 --- a/cpp/benchmarks/text/ngrams_benchmark.cpp +++ b/cpp/benchmarks/text/ngrams_benchmark.cpp @@ -43,7 +43,7 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt) cudf::strings_column_view input(table->view().column(0)); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true); switch (nt) { case ngrams_type::tokens: nvtext::generate_ngrams(input); break; case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; diff --git a/cpp/benchmarks/text/normalize_benchmark.cpp b/cpp/benchmarks/text/normalize_benchmark.cpp index 32c4fb7dcde..f041547d021 100644 --- a/cpp/benchmarks/text/normalize_benchmark.cpp +++ b/cpp/benchmarks/text/normalize_benchmark.cpp @@ -41,7 +41,7 @@ static void BM_normalize(benchmark::State& state, bool 
to_lower) cudf::strings_column_view input(table->view().column(0)); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); nvtext::normalize_characters(input, to_lower); } @@ -60,7 +60,7 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { // avoid generating combinations that exceed the cudf column limit size_t total_chars = static_cast(row_count) * rowlen * 4; - if (total_chars < std::numeric_limits::max()) { + if (total_chars < static_cast(std::numeric_limits::max())) { b->Args({row_count, rowlen}); } } diff --git a/cpp/benchmarks/text/normalize_spaces_benchmark.cpp b/cpp/benchmarks/text/normalize_spaces_benchmark.cpp index dcabb0c225c..6260bb02c55 100644 --- a/cpp/benchmarks/text/normalize_spaces_benchmark.cpp +++ b/cpp/benchmarks/text/normalize_spaces_benchmark.cpp @@ -42,7 +42,7 @@ static void BM_normalize(benchmark::State& state) cudf::strings_column_view input(table->view().column(0)); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); nvtext::normalize_spaces(input); } diff --git a/cpp/benchmarks/text/replace_benchmark.cpp b/cpp/benchmarks/text/replace_benchmark.cpp index f5428aee225..8f6704ab1af 100644 --- a/cpp/benchmarks/text/replace_benchmark.cpp +++ b/cpp/benchmarks/text/replace_benchmark.cpp @@ -54,7 +54,7 @@ static void BM_replace(benchmark::State& state) cudf::test::strings_column_wrapper replacements({"1", "2", "7", "0"}); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true); nvtext::replace_tokens( view, cudf::strings_column_view(targets), cudf::strings_column_view(replacements)); } diff --git a/cpp/benchmarks/text/tokenize_benchmark.cpp b/cpp/benchmarks/text/tokenize_benchmark.cpp index f9e742f0f31..cd6428a9406 100644 --- 
a/cpp/benchmarks/text/tokenize_benchmark.cpp +++ b/cpp/benchmarks/text/tokenize_benchmark.cpp @@ -31,7 +31,7 @@ class TextTokenize : public cudf::benchmark { }; -enum class tokenize_type { single, multi, count, count_multi, ngrams }; +enum class tokenize_type { single, multi, count, count_multi, ngrams, characters }; static void BM_tokenize(benchmark::State& state, tokenize_type tt) { @@ -46,13 +46,19 @@ static void BM_tokenize(benchmark::State& state, tokenize_type tt) cudf::test::strings_column_wrapper delimiters({" ", "+", "-"}); for (auto _ : state) { - cuda_event_timer raii(state, true, 0); + cuda_event_timer raii(state, true, rmm::cuda_stream_default); switch (tt) { - case tokenize_type::single: nvtext::tokenize(input); break; + case tokenize_type::single: + // single whitespace delimiter + nvtext::tokenize(input); + break; case tokenize_type::multi: nvtext::tokenize(input, cudf::strings_column_view(delimiters)); break; - case tokenize_type::count: nvtext::count_tokens(input); break; + case tokenize_type::count: + // single whitespace delimiter + nvtext::count_tokens(input); + break; case tokenize_type::count_multi: nvtext::count_tokens(input, cudf::strings_column_view(delimiters)); break; @@ -60,6 +66,10 @@ static void BM_tokenize(benchmark::State& state, tokenize_type tt) // default is bigrams nvtext::ngrams_tokenize(input); break; + case tokenize_type::characters: + // every character becomes a string + nvtext::character_tokenize(input); + break; } } @@ -90,3 +100,4 @@ NVTEXT_BENCHMARK_DEFINE(multi) NVTEXT_BENCHMARK_DEFINE(count) NVTEXT_BENCHMARK_DEFINE(count_multi) NVTEXT_BENCHMARK_DEFINE(ngrams) +NVTEXT_BENCHMARK_DEFINE(characters) diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu index 18ef5a1168e..b09a7911595 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher_benchmark.cu @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,26 +14,22 @@ * limitations under the License. */ +#include "../fixture/benchmark_fixture.hpp" +#include "../synchronization/synchronization.hpp" + +#include + #include #include +#include +#include #include #include -#include -#include -#include -#include -#include - -#include +#include +#include -#include -#include #include -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" - -using namespace cudf; enum DispatchingType { HOST_DISPATCHING, DEVICE_DISPATCHING, NO_DISPATCHING }; @@ -75,7 +71,7 @@ __global__ void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_ // This is for HOST_DISPATCHING template -__global__ void host_dispatching_kernel(mutable_column_device_view source_column) +__global__ void host_dispatching_kernel(cudf::mutable_column_device_view source_column) { using F = Functor; T* A = source_column.data(); @@ -89,7 +85,7 @@ __global__ void host_dispatching_kernel(mutable_column_device_view source_column template struct ColumnHandle { template ())> - void operator()(mutable_column_device_view source_column, int work_per_thread) + void operator()(cudf::mutable_column_device_view source_column, int work_per_thread) { cudf::detail::grid_1d grid_config{source_column.size(), block_size}; int grid_size = grid_config.num_blocks; @@ -98,7 +94,7 @@ struct ColumnHandle { } template ())> - void operator()(mutable_column_device_view source_column, int work_per_thread) + void operator()(cudf::mutable_column_device_view source_column, int work_per_thread) { CUDF_FAIL("Invalid type to benchmark."); } @@ -112,14 +108,14 @@ struct ColumnHandle { template struct RowHandle { template ())> - __device__ void operator()(mutable_column_device_view source, cudf::size_type index) + __device__ 
void operator()(cudf::mutable_column_device_view source, cudf::size_type index) { using F = Functor; source.data()[index] = F::f(source.data()[index]); } template ())> - __device__ void operator()(mutable_column_device_view source, cudf::size_type index) + __device__ void operator()(cudf::mutable_column_device_view source, cudf::size_type index) { cudf_assert(false && "Unsupported type."); } @@ -127,7 +123,7 @@ struct RowHandle { // This is for DEVICE_DISPATCHING template -__global__ void device_dispatching_kernel(mutable_table_device_view source) +__global__ void device_dispatching_kernel(cudf::mutable_table_device_view source) { const cudf::size_type n_rows = source.num_rows(); cudf::size_type index = threadIdx.x + blockIdx.x * blockDim.x; @@ -142,7 +138,7 @@ __global__ void device_dispatching_kernel(mutable_table_device_view source) } template -void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread) +void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_thread) { const cudf::size_type n_rows = input.num_rows(); const cudf::size_type n_cols = input.num_columns(); @@ -153,12 +149,12 @@ void launch_kernel(mutable_table_view input, T** d_ptr, int work_per_thread) if (dispatching_type == HOST_DISPATCHING) { // std::vector v_stream(n_cols); for (int c = 0; c < n_cols; c++) { - auto d_column = mutable_column_device_view::create(input.column(c)); + auto d_column = cudf::mutable_column_device_view::create(input.column(c)); cudf::type_dispatcher( d_column->type(), ColumnHandle{}, *d_column, work_per_thread); } } else if (dispatching_type == DEVICE_DISPATCHING) { - auto d_table_view = mutable_table_device_view::create(input); + auto d_table_view = cudf::mutable_table_device_view::create(input); auto f = device_dispatching_kernel; // Launch the kernel f<<>>(*d_table_view); @@ -191,25 +187,26 @@ void type_dispatcher_benchmark(::benchmark::State& state) cudf::mutable_table_view source_table{source_columns}; // For no dispatching - 
std::vector> h_vec(n_cols, - rmm::device_vector(source_size, 0)); + std::vector h_vec(n_cols); std::vector h_vec_p(n_cols); - for (int c = 0; c < n_cols; c++) { h_vec_p[c] = h_vec[c].data().get(); } - rmm::device_vector d_vec(n_cols); + std::transform(h_vec.begin(), h_vec.end(), h_vec_p.begin(), [source_size](auto& col) { + col.resize(source_size * sizeof(TypeParam), rmm::cuda_stream_default); + return static_cast(col.data()); + }); + rmm::device_uvector d_vec(n_cols, rmm::cuda_stream_default); if (dispatching_type == NO_DISPATCHING) { CUDA_TRY(cudaMemcpy( - d_vec.data().get(), h_vec_p.data(), sizeof(TypeParam*) * n_cols, cudaMemcpyHostToDevice)); + d_vec.data(), h_vec_p.data(), sizeof(TypeParam*) * n_cols, cudaMemcpyHostToDevice)); } // Warm up - launch_kernel(source_table, d_vec.data().get(), work_per_thread); + launch_kernel(source_table, d_vec.data(), work_per_thread); CUDA_TRY(cudaDeviceSynchronize()); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - launch_kernel( - source_table, d_vec.data().get(), work_per_thread); + launch_kernel(source_table, d_vec.data(), work_per_thread); } state.SetBytesProcessed(static_cast(state.iterations()) * source_size * n_cols * 2 * diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index d05b4b347f1..eb1ade61440 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -40,7 +40,7 @@ function(jit_preprocess_files) -o ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files -i -m - -std=c++14 + -std=c++17 -remove-unused-globals -D__CUDACC_RTC__ -I${CUDF_SOURCE_DIR}/include diff --git a/cpp/cmake/Modules/SetGPUArchs.cmake b/cpp/cmake/Modules/SetGPUArchs.cmake index f09d5ead8e2..8ab3c14d671 100644 --- a/cpp/cmake/Modules/SetGPUArchs.cmake +++ b/cpp/cmake/Modules/SetGPUArchs.cmake @@ -38,16 +38,6 @@ if(NOT DEFINED CUDAToolkit_VERSION AND CMAKE_CUDA_COMPILER) 
unset(NVCC_OUT) endif() -if(CUDAToolkit_VERSION_MAJOR LESS 11) - list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "80") -endif() -if(CUDAToolkit_VERSION_MAJOR LESS 10) - list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "75") -endif() -if(CUDAToolkit_VERSION_MAJOR LESS 9) - list(REMOVE_ITEM SUPPORTED_CUDA_ARCHITECTURES "70") -endif() - if(${PROJECT_NAME}_BUILD_FOR_ALL_ARCHS) set(CMAKE_CUDA_ARCHITECTURES ${SUPPORTED_CUDA_ARCHITECTURES}) diff --git a/cpp/cmake/cudf-build-config.cmake.in b/cpp/cmake/cudf-build-config.cmake.in index ed1926f20f0..358c4377078 100644 --- a/cpp/cmake/cudf-build-config.cmake.in +++ b/cpp/cmake/cudf-build-config.cmake.in @@ -43,8 +43,6 @@ find_dependency(ZLIB) # add third party dependencies using CPM include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetCPM.cmake) -# find boost -include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_FindBoost.cmake) # find jitify include(@CUDF_SOURCE_DIR@/cmake/thirdparty/CUDF_GetJitify.cmake) # find thrust/cub diff --git a/cpp/cmake/cudf-config.cmake.in b/cpp/cmake/cudf-config.cmake.in index 66c669851fa..86755696607 100644 --- a/cpp/cmake/cudf-config.cmake.in +++ b/cpp/cmake/cudf-config.cmake.in @@ -71,11 +71,6 @@ find_dependency(CUDAToolkit) find_dependency(Threads) find_dependency(ZLIB) -# Don't look for a Boost CMake configuration file because it adds the -# `-DBOOST_ALL_NO_LIB` and `-DBOOST_FILESYSTEM_DYN_LINK` compile defs -set(Boost_NO_BOOST_CMAKE ON) -find_dependency(Boost @CUDF_MIN_VERSION_Boost@ COMPONENTS filesystem) - find_dependency(Arrow @CUDF_VERSION_Arrow@) set(ArrowCUDA_DIR "${Arrow_DIR}") diff --git a/cpp/cmake/thirdparty/CUDF_FindBoost.cmake b/cpp/cmake/thirdparty/CUDF_FindBoost.cmake deleted file mode 100644 index fef393d7f20..00000000000 --- a/cpp/cmake/thirdparty/CUDF_FindBoost.cmake +++ /dev/null @@ -1,38 +0,0 @@ -#============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -# Min version set to newest boost in Ubuntu bionic apt repositories -set(CUDF_MIN_VERSION_Boost 1.65.0) - -# Don't look for a Boost CMake configuration file because it adds the -# `-DBOOST_ALL_NO_LIB` and `-DBOOST_FILESYSTEM_DYN_LINK` compile defs -set(Boost_NO_BOOST_CMAKE ON) - -# TODO: Use CPMFindPackage to add or build Boost - -find_package(Boost ${CUDF_MIN_VERSION_Boost} QUIET MODULE COMPONENTS filesystem) - -message(VERBOSE "CUDF: Boost_FOUND: ${Boost_FOUND}") - -if(NOT Boost_FOUND) - message(FATAL_ERROR "CUDF: Boost not found, please check your settings.") -endif() - -message(VERBOSE "CUDF: Boost_LIBRARIES: ${Boost_LIBRARIES}") -message(VERBOSE "CUDF: Boost_INCLUDE_DIRS: ${Boost_INCLUDE_DIRS}") - -list(APPEND CUDF_CXX_DEFINITIONS BOOST_NO_CXX14_CONSTEXPR) -list(APPEND CUDF_CUDA_DEFINITIONS BOOST_NO_CXX14_CONSTEXPR) diff --git a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake b/cpp/cmake/thirdparty/CUDF_GetCPM.cmake index 19c07933d42..ce2921f5954 100644 --- a/cpp/cmake/thirdparty/CUDF_GetCPM.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetCPM.cmake @@ -1,6 +1,8 @@ -set(CPM_DOWNLOAD_VERSION 3b404296b539e596f39421c4e92bc803b299d964) # v0.27.5 +set(CPM_DOWNLOAD_VERSION 7644c3a40fc7889f8dee53ce21e85dc390b883dc) # v0.32.1 if(CPM_SOURCE_CACHE) + # Expand relative path. 
This is important if the provided path contains a tilde (~) + get_filename_component(CPM_SOURCE_CACHE ${CPM_SOURCE_CACHE} ABSOLUTE) set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") elseif(DEFINED ENV{CPM_SOURCE_CACHE}) set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake") @@ -12,7 +14,7 @@ if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION})) message(VERBOSE "CUDF: Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}") file( DOWNLOAD - https://raw.githubusercontent.com/TheLartians/CPM.cmake/${CPM_DOWNLOAD_VERSION}/cmake/CPM.cmake + https://raw.githubusercontent.com/cpm-cmake/CPM.cmake/${CPM_DOWNLOAD_VERSION}/cmake/CPM.cmake ${CPM_DOWNLOAD_LOCATION}) endif() diff --git a/cpp/cmake/thirdparty/CUDF_GetDLPack.cmake b/cpp/cmake/thirdparty/CUDF_GetDLPack.cmake index b41c6d3b8d2..349f75d604f 100644 --- a/cpp/cmake/thirdparty/CUDF_GetDLPack.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetDLPack.cmake @@ -36,6 +36,6 @@ function(find_and_configure_dlpack VERSION) set(DLPACK_INCLUDE_DIR "${dlpack_SOURCE_DIR}/include" PARENT_SCOPE) endfunction() -set(CUDF_MIN_VERSION_dlpack 0.3) +set(CUDF_MIN_VERSION_dlpack 0.5) find_and_configure_dlpack(${CUDF_MIN_VERSION_dlpack}) diff --git a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake index 136947674f9..b2861ae48c4 100644 --- a/cpp/cmake/thirdparty/CUDF_GetRMM.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetRMM.cmake @@ -14,49 +14,37 @@ # limitations under the License. 
#============================================================================= -function(cudf_save_if_enabled var) - if(CUDF_${var}) - unset(${var} PARENT_SCOPE) - unset(${var} CACHE) - endif() -endfunction() - -function(cudf_restore_if_enabled var) - if(CUDF_${var}) - set(${var} ON CACHE INTERNAL "" FORCE) - endif() -endfunction() - function(find_and_configure_rmm VERSION) if(TARGET rmm::rmm) return() endif() + if(${VERSION} MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(MAJOR_AND_MINOR "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}") + else() + set(MAJOR_AND_MINOR "${VERSION}") + endif() + # Consumers have two options for local source builds: # 1. Pass `-D CPM_rmm_SOURCE=/path/to/rmm` to build a local RMM source tree # 2. Pass `-D CMAKE_PREFIX_PATH=/path/to/rmm/build` to use an existing local # RMM build directory as the install location for find_package(rmm) - cudf_save_if_enabled(BUILD_TESTS) - cudf_save_if_enabled(BUILD_BENCHMARKS) - CPMFindPackage(NAME rmm VERSION ${VERSION} GIT_REPOSITORY https://github.com/rapidsai/rmm.git - GIT_TAG branch-${VERSION} + GIT_TAG branch-${MAJOR_AND_MINOR} GIT_SHALLOW TRUE OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "CUDA_STATIC_RUNTIME ${CUDA_STATIC_RUNTIME}" "DISABLE_DEPRECATION_WARNING ${DISABLE_DEPRECATION_WARNING}" ) - cudf_restore_if_enabled(BUILD_TESTS) - cudf_restore_if_enabled(BUILD_BENCHMARKS) # Make sure consumers of cudf can also see rmm::rmm fix_cmake_global_defaults(rmm::rmm) endfunction() -set(CUDF_MIN_VERSION_rmm "${CUDF_VERSION_MAJOR}.${CUDF_VERSION_MINOR}") +set(CUDF_MIN_VERSION_rmm "${CUDF_VERSION_MAJOR}.${CUDF_VERSION_MINOR}.00") find_and_configure_rmm(${CUDF_MIN_VERSION_rmm}) diff --git a/cpp/cmake/thirdparty/CUDF_GetThrust.cmake b/cpp/cmake/thirdparty/CUDF_GetThrust.cmake index 5a304f234d2..343ade8664d 100644 --- a/cpp/cmake/thirdparty/CUDF_GetThrust.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetThrust.cmake @@ -15,18 +15,29 @@ #============================================================================= 
function(find_and_configure_thrust VERSION) + # We only want to set `UPDATE_DISCONNECTED` while + # the GIT tag hasn't moved from the last time we cloned + set(cpm_thrust_disconnect_update "UPDATE_DISCONNECTED TRUE") + set(CPM_THRUST_CURRENT_VERSION ${VERSION} CACHE STRING "version of thrust we checked out") + if(NOT VERSION VERSION_EQUAL CPM_THRUST_CURRENT_VERSION) + set(CPM_THRUST_CURRENT_VERSION ${VERSION} CACHE STRING "version of thrust we checked out" FORCE) + set(cpm_thrust_disconnect_update "") + endif() + CPMAddPackage(NAME Thrust VERSION ${VERSION} GIT_REPOSITORY https://github.com/NVIDIA/thrust.git GIT_TAG ${VERSION} GIT_SHALLOW TRUE - PATCH_COMMAND patch -p1 -N < ${CUDF_SOURCE_DIR}/cmake/thrust.patch || true) + ${cpm_thrust_disconnect_update} + PATCH_COMMAND patch --reject-file=- -p1 -N < ${CUDF_SOURCE_DIR}/cmake/thrust.patch || true + ) thrust_create_target(cudf::Thrust FROM_OPTIONS) set(THRUST_LIBRARY "cudf::Thrust" PARENT_SCOPE) set(Thrust_SOURCE_DIR "${Thrust_SOURCE_DIR}" PARENT_SCOPE) endfunction() -set(CUDF_MIN_VERSION_Thrust 1.10.0) +set(CUDF_MIN_VERSION_Thrust 1.12.0) find_and_configure_thrust(${CUDF_MIN_VERSION_Thrust}) diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch index 3f876f7ffb7..2f9201d8ab4 100644 --- a/cpp/cmake/thrust.patch +++ b/cpp/cmake/thrust.patch @@ -42,3 +42,42 @@ index 1ffeef0..5e80800 100644 for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) { if (ITEMS_PER_THREAD * tid + ITEM < num_remaining) +diff a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh +index 41eb1d2..f2893b4 100644 +--- a/cub/device/dispatch/dispatch_radix_sort.cuh ++++ b/cub/device/dispatch/dispatch_radix_sort.cuh +@@ -723,7 +723,7 @@ struct DeviceRadixSortPolicy + + + /// SM60 (GP100) +- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
7 : 5, // 6.9B 32b keys/s (Quadro P100) +diff a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh +index f6aee45..dd64301 100644 +--- a/cub/device/dispatch/dispatch_reduce.cuh ++++ b/cub/device/dispatch/dispatch_reduce.cuh +@@ -284,7 +284,7 @@ struct DeviceReducePolicy + }; + + /// SM60 +- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) + typedef AgentReducePolicy< +diff a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh +index c0c6d59..937ee31 100644 +--- a/cub/device/dispatch/dispatch_scan.cuh ++++ b/cub/device/dispatch/dispatch_scan.cuh +@@ -178,7 +178,7 @@ struct DeviceScanPolicy + }; + + /// SM600 +- struct Policy600 : ChainedPolicy<600, Policy600, Policy520> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + typedef AgentScanPolicy< + 128, 15, ///< Threads per block, items per thread diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index fa59162c345..0f6e110ffd0 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -255,6 +255,11 @@ currently supported by cudf. Each type of value is represented by a separate typ which are all derived from `cudf::scalar`. e.g. A `numeric_scalar` holds a single numerical value, a `string_scalar` holds a single string. The data for the stored value resides in device memory. +A `list_scalar` holds the underlying data of a single list. This means the underlying data can be any type +that cudf supports. For example, a `list_scalar` representing a list of integers stores a `cudf::column` +of type `INT32`. A `list_scalar` representing a list of lists of integers stores a `cudf::column` of +type `LIST`, which in turn stores a column of type `INT32`. 
+ |Value type|Scalar class|Notes| |-|-|-| |fixed-width|`fixed_width_scalar`| `T` can be any fixed-width type| @@ -263,6 +268,7 @@ a `string_scalar` holds a single string. The data for the stored value resides i |timestamp|`timestamp_scalar` | `T` can be `timestamp_D`, `timestamp_s`, etc.| |duration|`duration_scalar` | `T` can be `duration_D`, `duration_s`, etc.| |string|`string_scalar`| This class object is immutable| +|list|`list_scalar`| Underlying data can be any type supported by cudf | ### Construction `scalar`s can be created using either their respective constructors or using factory functions like @@ -285,11 +291,16 @@ auto s1 = static_cast(s.get()); ``` ### Passing to device -Each scalar type has a corresponding non-owning device view class which allows access to the value -and its validity from the device. This can be obtained using the function +Each scalar type, except `list_scalar`, has a corresponding non-owning device view class which allows +access to the value and its validity from the device. This can be obtained using the function `get_scalar_device_view(ScalarType s)`. Note that a device view is not provided for a base scalar object, only for the derived typed scalar class objects. +The underlying data for `list_scalar` can be accessed via `view()` method. For non-nested data, +the device view can be obtained via function `column_device_view::create(column_view)`. For nested +data, a specialized device view for list columns can be constructed via +`lists_column_device_view(column_device_view)`. + # libcudf++ API and Implementation ## Streams @@ -403,9 +414,9 @@ Allocates a specified number of bytes of untyped, uninitialized device memory us `device_memory_resource`. If no resource is explicitly provided, uses `rmm::mr::get_current_device_resource()`. -`rmm::device_buffer` is copyable and movable. 
A copy performs a deep copy of the `device_buffer`'s -device memory, whereas a move moves ownership of the device memory from one `device_buffer` to -another. +`rmm::device_buffer` is movable and copyable on a stream. A copy performs a deep copy of the +`device_buffer`'s device memory on the specified stream, whereas a move moves ownership of the +device memory from one `device_buffer` to another. ```c++ // Allocates at least 100 bytes of uninitialized device memory @@ -413,11 +424,15 @@ another. rmm::device_buffer buff(100, stream, mr); void * raw_data = buff.data(); // Raw pointer to underlying device memory -rmm::device_buffer copy(buff); // Deep copies `buff` into `copy` -rmm::device_buffer moved_to(std::move(buff)); // Moves contents of `buff` into `moved_to` +// Deep copies `buff` into `copy` on `stream` +rmm::device_buffer copy(buff, stream); + +// Moves contents of `buff` into `moved_to` +rmm::device_buffer moved_to(std::move(buff)); custom_memory_resource *mr...; -rmm::device_buffer custom_buff(100, mr); // Allocates 100 bytes from the custom_memory_resource +// Allocates 100 bytes from the custom_memory_resource +rmm::device_buffer custom_buff(100, mr, stream); ``` #### `rmm::device_scalar` @@ -530,6 +545,30 @@ Note: `std::tuple` _could_ be used if not for the fact that Cython does not s only two objects of different types. Multiple objects of the same type may be returned via a `std::vector`. +Alternatively, with C++17 (supported from cudf v0.20), [structured binding](https://en.cppreference.com/w/cpp/language/structured_binding) +may be used to disaggregate multiple return values: + +```c++ +auto [out0, out1] = cudf::return_two_outputs(); +``` + +Note that the compiler might not support capturing aliases defined in a structured binding +in a lambda. 
One may work around this by using a capture with an initializer instead: + +```c++ +auto [out0, out1] = cudf::return_two_outputs(); + +// Direct capture of alias from structured binding might fail with: +// "error: structured binding cannot be captured" +// auto foo = [out0]() {...}; + +// Use an initializing capture: +auto foo = [&out0 = out0] { + // Use out0 to compute something. + // ... +}; +``` + ## Iterator-based interfaces Increasingly, libcudf is moving toward internal (`detail`) APIs with iterator parameters rather @@ -929,21 +968,18 @@ this compound column representation of strings. ## Structs columns -Structs are represented similarly to lists, except that they have multiple child data columns. -The parent column's type is `STRUCT` and contains no data, but its size represents the number of -structs in the column, and its null mask represents the validity of each struct element. The parent -has `N + 1` children, where `N` is the number of fields in the struct. +A struct is a nested data type with a set of child columns each representing an individual field +of a logical struct. Field names are not represented. -1. A non-nullable column of `INT32` elements that indicates the offset to the beginning of each - struct in each dense column of elements. -2. For each field, a column containing the actual field data and optional null mask for all elements - of all the structs packed together. - -With this representation, `child[0][offsets[i]]` is the first field of struct `i`, -`child[1][offsets[i]]` is the second field of struct `i`, etc. +A structs column with `N` fields has `N` children. Each child is a column storing all the data +of a single field packed column-wise, with an optional null mask. The parent column's type is +`STRUCT` and contains no data, its size represents the number of struct rows in the column, and its +null mask represents the validity of each struct element. 
+ +With this representation, `child[0][10]` is row 10 of the first field of the struct, `child[1][42]` +is row 42 of the second field of the struct. -As defined in the [Apache Arrow specification](https://arrow.apache.org/docs/format/Columnar.html#struct-layout), -in addition to the struct column's null mask, each struct field column has its own optional null +Notice that in addition to the struct column's null mask, each struct field column has its own optional null mask. A struct field's validity can vary independently from the corresponding struct row. For instance, a non-null struct row might have a null field. However, the fields of a null struct row are deemed to be null as well. For example, consider a struct column of type diff --git a/cpp/docs/TESTING.md b/cpp/docs/TESTING.md index 638f7224ab8..2c7b62b8b6d 100644 --- a/cpp/docs/TESTING.md +++ b/cpp/docs/TESTING.md @@ -1,7 +1,7 @@ # Unit Testing in libcudf Unit tests in libcudf are written using -[Google Test](https://github.com/google/googletest/blob/master/googletest/docs/primer.md). +[Google Test](https://github.com/google/googletest/blob/master/docs/primer.md). **Important:** Instead of including `gtest/gtest.h` directly, use `#include `. @@ -59,7 +59,7 @@ files, and are therefore preferred in test code over `thrust::device_vector`. ## Base Fixture -All libcudf unit tests should make use of a GTest ["Test Fixture"](https://github.com/google/googletest/blob/master/googletest/docs/primer.md#test-fixtures-using-the-same-data-configuration-for-multiple-tests-same-data-multiple-tests). +All libcudf unit tests should make use of a GTest ["Test Fixture"](https://github.com/google/googletest/blob/master/docs/primer.md#test-fixtures-using-the-same-data-configuration-for-multiple-tests-same-data-multiple-tests). Even if the fixture is empty, it should inherit from the base fixture `cudf::test::BaseFixture` found in `include/cudf_test/base_fixture.hpp`. 
This ensures that RMM is properly initialized and finalized. `cudf::test::BaseFixture` already inherits from `::testing::Test` and therefore it is @@ -75,7 +75,7 @@ class MyTestFiture : public cudf::test::BaseFixture {...}; In general, libcudf features must work across all of the supported types (there are exceptions e.g. not all binary operations are supported for all types). In order to automate the process of running the same tests across multiple types, we use GTest's -[Typed Tests](https://github.com/google/googletest/blob/master/googletest/docs/advanced.md#typed-tests). +[Typed Tests](https://github.com/google/googletest/blob/master/docs/advanced.md#typed-tests). Typed tests allow you to write a test once and run it across a list of types. For example: diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 8fde8098bd3..d359fe59c1a 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 0.19.0 +PROJECT_NUMBER = 21.06.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2167,7 +2167,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/0.19 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/21.06 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. 
See section "Linking to diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 74ce6e42d7e..2600926d363 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -40,14 +40,16 @@ namespace cudf { // forward declaration namespace detail { +class simple_aggregations_collector; class aggregation_finalizer; } // namespace detail /** - * @brief Base class for specifying the desired aggregation in an + * @brief Abstract base class for specifying the desired aggregation in an * `aggregation_request`. * - * Other kinds of aggregations may derive from this class to encapsulate - * additional information needed to compute the aggregation. + * All aggregations must derive from this class to implement the pure virtual + * functions and potentially encapsulate additional information needed to + * compute the aggregation. */ class aggregation { public: @@ -82,58 +84,78 @@ class aggregation { CUDA ///< CUDA UDF based reduction }; + aggregation() = delete; aggregation(aggregation::Kind a) : kind{a} {} Kind kind; ///< The aggregation to perform + virtual ~aggregation() = default; virtual bool is_equal(aggregation const& other) const { return kind == other.kind; } - virtual size_t do_hash() const { return std::hash{}(kind); } + virtual std::unique_ptr clone() const = 0; - virtual std::unique_ptr clone() const - { - return std::make_unique(*this); - } + // override functions for compound aggregations + virtual std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const = 0; + virtual void finalize(cudf::detail::aggregation_finalizer& finalizer) const = 0; +}; - virtual ~aggregation() = default; +/** + * @brief Derived class intended for enforcing operation-specific restrictions + * when calling various cudf functions. 
+ * + * As an example, rolling_window will only accept rolling_aggregation inputs, + * and the appropriate derived classes (sum_aggregation, mean_aggregation, etc) + * derive from this interface to represent these valid options. + */ +class rolling_aggregation : public virtual aggregation { + public: + ~rolling_aggregation() = default; - // override functions for compound aggregations - virtual std::vector get_simple_aggregations(data_type col_type) const; - virtual void finalize(cudf::detail::aggregation_finalizer& finalizer); + protected: + rolling_aggregation() {} }; enum class udf_type : bool { CUDA, PTX }; /// Factory to create a SUM aggregation -std::unique_ptr make_sum_aggregation(); +template +std::unique_ptr make_sum_aggregation(); /// Factory to create a PRODUCT aggregation -std::unique_ptr make_product_aggregation(); +template +std::unique_ptr make_product_aggregation(); /// Factory to create a MIN aggregation -std::unique_ptr make_min_aggregation(); +template +std::unique_ptr make_min_aggregation(); /// Factory to create a MAX aggregation -std::unique_ptr make_max_aggregation(); +template +std::unique_ptr make_max_aggregation(); /** * @brief Factory to create a COUNT aggregation * * @param null_handling Indicates if null values will be counted. 
*/ -std::unique_ptr make_count_aggregation( - null_policy null_handling = null_policy::EXCLUDE); +template +std::unique_ptr make_count_aggregation(null_policy null_handling = null_policy::EXCLUDE); -/// Factory to create a ANY aggregation -std::unique_ptr make_any_aggregation(); +/// Factory to create an ANY aggregation +template +std::unique_ptr make_any_aggregation(); /// Factory to create a ALL aggregation -std::unique_ptr make_all_aggregation(); +template +std::unique_ptr make_all_aggregation(); /// Factory to create a SUM_OF_SQUARES aggregation -std::unique_ptr make_sum_of_squares_aggregation(); +template +std::unique_ptr make_sum_of_squares_aggregation(); /// Factory to create a MEAN aggregation -std::unique_ptr make_mean_aggregation(); +template +std::unique_ptr make_mean_aggregation(); /** * @brief Factory to create a VARIANCE aggregation @@ -141,7 +163,8 @@ std::unique_ptr make_mean_aggregation(); * @param ddof Delta degrees of freedom. The divisor used in calculation of * `variance` is `N - ddof`, where `N` is the population size. */ -std::unique_ptr make_variance_aggregation(size_type ddof = 1); +template +std::unique_ptr make_variance_aggregation(size_type ddof = 1); /** * @brief Factory to create a STD aggregation @@ -149,10 +172,12 @@ std::unique_ptr make_variance_aggregation(size_type ddof = 1); * @param ddof Delta degrees of freedom. The divisor used in calculation of * `std` is `N - ddof`, where `N` is the population size. 
*/ -std::unique_ptr make_std_aggregation(size_type ddof = 1); +template +std::unique_ptr make_std_aggregation(size_type ddof = 1); /// Factory to create a MEDIAN aggregation -std::unique_ptr make_median_aggregation(); +template +std::unique_ptr make_median_aggregation(); /** * @brief Factory to create a QUANTILE aggregation @@ -160,22 +185,25 @@ std::unique_ptr make_median_aggregation(); * @param quantiles The desired quantiles * @param interpolation The desired interpolation */ -std::unique_ptr make_quantile_aggregation(std::vector const& q, - interpolation i = interpolation::LINEAR); +template +std::unique_ptr make_quantile_aggregation(std::vector const& q, + interpolation i = interpolation::LINEAR); /** * @brief Factory to create an `argmax` aggregation * * `argmax` returns the index of the maximum element. */ -std::unique_ptr make_argmax_aggregation(); +template +std::unique_ptr make_argmax_aggregation(); /** * @brief Factory to create an `argmin` aggregation * * `argmin` returns the index of the minimum element. */ -std::unique_ptr make_argmin_aggregation(); +template +std::unique_ptr make_argmin_aggregation(); /** * @brief Factory to create a `nunique` aggregation @@ -183,8 +211,8 @@ std::unique_ptr make_argmin_aggregation(); * `nunique` returns the number of unique elements. * @param null_handling Indicates if null values will be counted. */ -std::unique_ptr make_nunique_aggregation( - null_policy null_handling = null_policy::EXCLUDE); +template +std::unique_ptr make_nunique_aggregation(null_policy null_handling = null_policy::EXCLUDE); /** * @brief Factory to create a `nth_element` aggregation @@ -199,11 +227,13 @@ std::unique_ptr make_nunique_aggregation( * @param n index of nth element in each group. * @param null_handling Indicates to include/exclude nulls during indexing. 
*/ -std::unique_ptr make_nth_element_aggregation( +template +std::unique_ptr make_nth_element_aggregation( size_type n, null_policy null_handling = null_policy::INCLUDE); /// Factory to create a ROW_NUMBER aggregation -std::unique_ptr make_row_number_aggregation(); +template +std::unique_ptr make_row_number_aggregation(); /** * @brief Factory to create a COLLECT_LIST aggregation @@ -215,7 +245,8 @@ std::unique_ptr make_row_number_aggregation(); * * @param null_handling Indicates whether to include/exclude nulls in list elements. */ -std::unique_ptr make_collect_list_aggregation( +template +std::unique_ptr make_collect_list_aggregation( null_policy null_handling = null_policy::INCLUDE); /** @@ -233,16 +264,18 @@ std::unique_ptr make_collect_list_aggregation( * @param nans_equal Flag to specify whether NaN values in floating point column should be * considered equal */ -std::unique_ptr make_collect_set_aggregation( - null_policy null_handling = null_policy::INCLUDE, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::UNEQUAL); +template +std::unique_ptr make_collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::UNEQUAL); /// Factory to create a LAG aggregation -std::unique_ptr make_lag_aggregation(size_type offset); +template +std::unique_ptr make_lag_aggregation(size_type offset); /// Factory to create a LEAD aggregation -std::unique_ptr make_lead_aggregation(size_type offset); +template +std::unique_ptr make_lead_aggregation(size_type offset); /** * @brief Factory to create an aggregation base on UDF for PTX or CUDA @@ -253,9 +286,10 @@ std::unique_ptr make_lead_aggregation(size_type offset); * * @return aggregation unique pointer housing user_defined_aggregator string. 
*/ -std::unique_ptr make_udf_aggregation(udf_type type, - std::string const& user_defined_aggregator, - data_type output_type); +template +std::unique_ptr make_udf_aggregation(udf_type type, + std::string const& user_defined_aggregator, + data_type output_type); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/ast/detail/linearizer.hpp b/cpp/include/cudf/ast/detail/linearizer.hpp index 68319a24e5d..166a0408703 100644 --- a/cpp/include/cudf/ast/detail/linearizer.hpp +++ b/cpp/include/cudf/ast/detail/linearizer.hpp @@ -83,10 +83,7 @@ class linearizer; * This class is a part of a "visitor" pattern with the `linearizer` class. * Nodes inheriting from this class can accept visitors. */ -class node { - friend class detail::linearizer; - - private: +struct node { virtual cudf::size_type accept(detail::linearizer& visitor) const = 0; }; @@ -102,10 +99,6 @@ class node { * resolved into intermediate data storage in shared memory. */ class linearizer { - friend class literal; - friend class column_reference; - friend class expression; - public: /** * @brief Construct a new linearizer object diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 27bcb0d320b..8ae60f96997 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -753,43 +753,6 @@ struct operator_functor { } }; -#if 0 -/** - * @brief Functor used to double-type-dispatch binary operators. - * - * This functor's `operator()` is templated to validate calls to its operators based on the input - * type, as determined by the `is_valid_binary_op` trait. - * - * @tparam OperatorFunctor Binary operator functor. - */ -template -struct double_dispatch_binary_operator_types { - template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... 
args) - { - f.template operator()(std::forward(args)...); - } - - template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... args) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid binary operation."); -#else - cudf_assert(false && "Invalid binary operation."); -#endif - } -}; -#endif - /** * @brief Functor used to single-type-dispatch binary operators. * @@ -856,16 +819,6 @@ struct type_dispatch_binary_op { F&& f, Ts&&... args) { -#if 0 - // Double dispatch - /* - double_type_dispatcher(lhs_type, - rhs_type, - detail::double_dispatch_binary_operator_types>{}, - std::forward(f), - std::forward(args)...); - */ -#endif // Single dispatch (assume lhs_type == rhs_type) type_dispatcher(lhs_type, detail::single_dispatch_binary_operator_types>{}, diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index da15ac07c63..f69927a3601 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -15,8 +15,9 @@ */ #pragma once +#include #include -#include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include #include #include @@ -155,10 +157,11 @@ struct row_evaluator { * storing intermediates. * @param output_column The output column where results are stored. 
*/ - __device__ row_evaluator(table_device_view const& table, - const cudf::detail::fixed_width_scalar_device_view_base* literals, - std::int64_t* thread_intermediate_storage, - mutable_column_device_view* output_column) + __device__ row_evaluator( + table_device_view const& table, + device_span literals, + std::int64_t* thread_intermediate_storage, + mutable_column_device_view* output_column) : table(table), literals(literals), thread_intermediate_storage(thread_intermediate_storage), @@ -264,7 +267,7 @@ struct row_evaluator { private: table_device_view const& table; - const cudf::detail::fixed_width_scalar_device_view_base* literals; + device_span literals; std::int64_t* thread_intermediate_storage; mutable_column_device_view* output_column; }; @@ -298,15 +301,15 @@ __device__ void row_output::resolve_output(detail::device_data_reference device_ * @param num_operators Number of operators. * @param row_index Row index of data column(s). */ -__device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, - const detail::device_data_reference* data_references, - const ast_operator* operators, - const cudf::size_type* operator_source_indices, - cudf::size_type num_operators, - cudf::size_type row_index) +__device__ void evaluate_row_expression( + detail::row_evaluator const& evaluator, + device_span data_references, + device_span operators, + device_span operator_source_indices, + cudf::size_type row_index) { - auto operator_source_index = cudf::size_type(0); - for (cudf::size_type operator_index(0); operator_index < num_operators; operator_index++) { + auto operator_source_index = static_cast(0); + for (cudf::size_type operator_index = 0; operator_index < operators.size(); operator_index++) { // Execute operator auto const op = operators[operator_index]; auto const arity = ast_operator_arity(op); @@ -336,43 +339,79 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, } } +/** + * @brief The AST plan creates a device 
buffer of data needed to execute an AST. + * + * On construction, an AST plan creates a single "packed" host buffer of all necessary data arrays, + * and copies that to the device with a single host-device memory copy. Because the plan tends to be + * small, this is the most efficient approach for low latency. + * + */ struct ast_plan { - public: - ast_plan() : sizes(), data_pointers() {} + ast_plan(linearizer const& expr_linearizer, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _sizes{}, _data_pointers{} + { + add_to_plan(expr_linearizer.data_references()); + add_to_plan(expr_linearizer.literals()); + add_to_plan(expr_linearizer.operators()); + add_to_plan(expr_linearizer.operator_source_indices()); + + // Create device buffer + auto const buffer_size = std::accumulate(_sizes.cbegin(), _sizes.cend(), 0); + auto buffer_offsets = std::vector(_sizes.size()); + thrust::exclusive_scan(_sizes.cbegin(), _sizes.cend(), buffer_offsets.begin(), 0); + + auto h_data_buffer = std::make_unique(buffer_size); + for (unsigned int i = 0; i < _data_pointers.size(); ++i) { + std::memcpy(h_data_buffer.get() + buffer_offsets[i], _data_pointers[i], _sizes[i]); + } - using buffer_type = std::pair, int>; + _device_data_buffer = rmm::device_buffer(h_data_buffer.get(), buffer_size, stream, mr); + + stream.synchronize(); + + // Create device pointers to components of plan + auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); + _device_data_references = device_span( + reinterpret_cast(device_data_buffer_ptr + + buffer_offsets[0]), + expr_linearizer.data_references().size()); + _device_literals = device_span( + reinterpret_cast( + device_data_buffer_ptr + buffer_offsets[1]), + expr_linearizer.literals().size()); + _device_operators = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), + expr_linearizer.operators().size()); + _device_operator_source_indices = device_span( + reinterpret_cast(device_data_buffer_ptr + 
buffer_offsets[3]), + expr_linearizer.operator_source_indices().size()); + } + /** + * @brief Helper function for adding components (operators, literals, etc) to AST plan + * + * @tparam T The underlying type of the input `std::vector` + * @param v The `std::vector` containing components (operators, literals, etc) + */ template void add_to_plan(std::vector const& v) { auto const data_size = sizeof(T) * v.size(); - sizes.push_back(data_size); - data_pointers.push_back(v.data()); + _sizes.push_back(data_size); + _data_pointers.push_back(v.data()); } - buffer_type get_host_data_buffer() const - { - auto const total_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); - auto host_data_buffer = std::make_unique(total_size); - auto const offsets = get_offsets(); - for (unsigned int i = 0; i < data_pointers.size(); ++i) { - std::memcpy(host_data_buffer.get() + offsets[i], data_pointers[i], sizes[i]); - } - return std::make_pair(std::move(host_data_buffer), total_size); - } + std::vector _sizes; + std::vector _data_pointers; - std::vector get_offsets() const - { - auto offsets = std::vector(sizes.size()); - // When C++17, use std::exclusive_scan - offsets[0] = 0; - std::partial_sum(sizes.cbegin(), sizes.cend() - 1, offsets.begin() + 1); - return offsets; - } - - private: - std::vector sizes; - std::vector data_pointers; + rmm::device_buffer _device_data_buffer; + device_span _device_data_references; + device_span _device_literals; + device_span _device_operators; + device_span _device_operator_source_indices; }; /** diff --git a/cpp/include/cudf/ast/linearizer.hpp b/cpp/include/cudf/ast/nodes.hpp similarity index 90% rename from cpp/include/cudf/ast/linearizer.hpp rename to cpp/include/cudf/ast/nodes.hpp index e5ccb2e8069..70dda58816e 100644 --- a/cpp/include/cudf/ast/linearizer.hpp +++ b/cpp/include/cudf/ast/nodes.hpp @@ -38,17 +38,10 @@ enum class table_reference { OUTPUT // Column index in the output table }; -// Forward declaration -class literal; -class 
column_reference; -class expression; - /** * @brief A literal value used in an abstract syntax tree. */ class literal : public detail::node { - friend class detail::linearizer; - public: /** * @brief Construct a new literal object. @@ -90,7 +83,6 @@ class literal : public detail::node { */ cudf::data_type get_data_type() const { return get_value().type(); } - private: /** * @brief Get the value object. * @@ -106,6 +98,7 @@ class literal : public detail::node { */ cudf::size_type accept(detail::linearizer& visitor) const override; + private: const cudf::detail::fixed_width_scalar_device_view_base value; }; @@ -113,8 +106,6 @@ class literal : public detail::node { * @brief A node referring to data from a column in a table. */ class column_reference : public detail::node { - friend class detail::linearizer; - public: /** * @brief Construct a new column reference object @@ -175,7 +166,6 @@ class column_reference : public detail::node { return table.column(get_column_index()).type(); } - private: /** * @brief Accepts a visitor class. * @@ -184,6 +174,7 @@ class column_reference : public detail::node { */ cudf::size_type accept(detail::linearizer& visitor) const override; + private: cudf::size_type column_index; table_reference table_source; }; @@ -192,8 +183,6 @@ class column_reference : public detail::node { * @brief An expression node holds an operator and zero or more operands. */ class expression : public detail::node { - friend class detail::linearizer; - public: /** * @brief Construct a new unary expression object. @@ -208,11 +197,6 @@ class expression : public detail::node { } } - /** - * @brief `expression` doesn't accept r-value references for expression nodes - */ - expression(ast_operator op, node&& input) = delete; - /** * @brief Construct a new binary expression object. 
* @@ -227,19 +211,11 @@ class expression : public detail::node { } } - /** - * @brief `expression` doesn't accept r-value references for expression nodes - */ - expression(ast_operator op, node&& left, node&& right) = delete; - - /** - * @brief `expression` doesn't accept r-value references for expression nodes - */ + // expression only stores references to nodes, so it does not accept r-value + // references: the calling code must own the nodes. + expression(ast_operator op, node&& input) = delete; + expression(ast_operator op, node&& left, node&& right) = delete; expression(ast_operator op, node&& left, node const& right) = delete; - - /** - * @brief `expression` doesn't accept r-value references for expression nodes - */ expression(ast_operator op, node const& left, node&& right) = delete; /** @@ -256,7 +232,6 @@ class expression : public detail::node { */ std::vector> get_operands() const { return operands; } - private: /** * @brief Accepts a visitor class. * @@ -265,6 +240,7 @@ class expression : public detail::node { */ cudf::size_type accept(detail::linearizer& visitor) const override; + private: const ast_operator op; const std::vector> operands; }; diff --git a/cpp/include/cudf/ast/transform.hpp b/cpp/include/cudf/ast/transform.hpp index 513f92ea251..59697e5f75c 100644 --- a/cpp/include/cudf/ast/transform.hpp +++ b/cpp/include/cudf/ast/transform.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include namespace cudf { diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index a08b10df6f4..ee367840644 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -49,13 +49,6 @@ class column { column& operator=(column const& other) = delete; column& operator=(column&& other) = delete; - /** - * @brief Construct a new column by deep copying the contents of `other`. - * - * @param other The column to copy - */ - column(column const& other); - /** * @brief Construct a new column object by deep copying the contents of *`other`. @@ -68,7 +61,7 @@ class column { * @param mr Device memory resource to use for all device memory allocations */ column(column const& other, - rmm::cuda_stream_view stream, + rmm::cuda_stream_view stream = rmm::cuda_stream_view{}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -165,18 +158,21 @@ class column { /** * @brief Sets the column's null value indicator bitmask to `new_null_mask`. * - * @throws cudf::logic_error if new_null_count is larger than 0 and the size - * of `new_null_mask` does not match the size of this column. - * - * @param new_null_mask New null value indicator bitmask (lvalue overload & - * copied) to set the column's null value indicator mask. May be empty if - * `new_null_count` is 0 or `UNKOWN_NULL_COUNT`. - * @param new_null_count Optional, the count of null elements. If unknown, - * specify `UNKNOWN_NULL_COUNT` to indicate that the null count should be - * computed on the first invocation of `null_count()`. + * @throws cudf::logic_error if new_null_count is larger than 0 and the size of `new_null_mask` + * does not match the size of this column. + * + * @param new_null_mask New null value indicator bitmask (lvalue overload & copied) to set the + * column's null value indicator mask. May be empty if `new_null_count` is 0 or + * `UNKOWN_NULL_COUNT`. + * @param new_null_count Optional, the count of null elements. If unknown, specify + * `UNKNOWN_NULL_COUNT` to indicate that the null count should be computed on the first invocation + * of `null_count()`. + * @param stream The stream on which to perform the allocation and copy. 
Uses the default CUDA + * stream if none is specified. */ void set_null_mask(rmm::device_buffer const& new_null_mask, - size_type new_null_count = UNKNOWN_NULL_COUNT); + size_type new_null_count = UNKNOWN_NULL_COUNT, + rmm::cuda_stream_view stream = rmm::cuda_stream_view{}); /** * @brief Updates the count of null elements. diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index a842e51c94a..5f42823afe4 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -31,6 +31,7 @@ #include #include +#include #include @@ -40,6 +41,28 @@ */ namespace cudf { + +/** + * @brief Policy for what assumptions the optional iterator has about null values + * + * - `YES` means that the column supports nulls and has null values, therefore + * the optional might not contain a value + * + * - `NO` means that the column has no null values, therefore the optional will + * always have a value + * + * - `DYNAMIC` defers the assumption of nullability to runtime with the users stating + * on construction of the iterator if column has nulls. 
+ */ +namespace contains_nulls { +struct YES { +}; +struct NO { +}; +struct DYNAMIC { +}; +} // namespace contains_nulls + namespace detail { /** * @brief An immutable, non-owning view of device data as a column of elements @@ -255,10 +278,11 @@ class alignas(16) column_device_view_base { : std::true_type { }; }; - // Forward declaration template struct value_accessor; +template +struct optional_accessor; template struct pair_accessor; template @@ -484,6 +508,13 @@ class alignas(16) column_device_view : public detail::column_device_view_base { return const_iterator{count_it{size()}, detail::value_accessor{*this}}; } + /** + * @brief optional iterator for navigating this column + */ + template + using const_optional_iterator = + thrust::transform_iterator, count_it>; + /** * @brief Pair iterator for navigating this column */ @@ -500,6 +531,124 @@ class alignas(16) column_device_view : public detail::column_device_view_base { using const_pair_rep_iterator = thrust::transform_iterator, count_it>; + /** + * @brief Return an optional iterator to the first element of the column. + * + * Dereferencing the returned iterator returns a `thrust::optional`. + * + * When the element of an iterator contextually converted to bool, the conversion returns true + * if the object contains a value and false if it does not contain a value. + * + * optional_begin with mode `DYNAMIC` defers the assumption of nullability to + * runtime, with the user stating on construction of the iterator if column has nulls. 
+ * `DYNAMIC` mode is nice when an algorithm is going to execute on mutliple + * iterators and you don't want to compile all the combinations of iterator types + * + * Example: + * + * \code{.cpp} + * template + * void some_function(cudf::column_view const& col_view){ + * auto d_col = cudf::column_device_view::create(col_view); + * // Create a `DYNAMIC` optional iterator + * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::DYNAMIC{}, + * col_view.has_nulls()); + * } + * \endcode + * + * This function does not participate in overload resolution if + * `column_device_view::has_element_accessor()` is false. + * + * @throws cudf::logic_error if the column is not nullable, and `DYNAMIC` mode used and + * the user has stated nulls exist + * @throws cudf::logic_error if column datatype and Element type mismatch. + */ + template ())> + auto optional_begin(contains_nulls::DYNAMIC, bool has_nulls) const + { + return const_optional_iterator{ + count_it{0}, detail::optional_accessor{*this, has_nulls}}; + } + + /** + * @brief Return an optional iterator to the first element of the column. + * + * Dereferencing the returned iterator returns a `thrust::optional`. + * + * When the element of an iterator contextually converted to bool, the conversion returns true + * if the object contains a value and false if it does not contain a value. 
+ * + * optional_begin with mode `YES` means that the column supports nulls and + * potentially has null values, therefore the optional might not contain a value + * + * Example: + * + * \code{.cpp} + * template + * void some_function(cudf::column_view const& col_view){ + * auto d_col = cudf::column_device_view::create(col_view); + * if constexpr(has_nulls) { + * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::YES{}); + * //use optional_iterator + * } else { + * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::NO{}); + * //use optional_iterator + * } + * } + * \endcode + * + * This function does not participate in overload resolution if + * `column_device_view::has_element_accessor()` is false. + * + * @throws cudf::logic_error if the column is not nullable, and `YES` mode used + * @throws cudf::logic_error if column datatype and Element type mismatch. + */ + template ())> + auto optional_begin(contains_nulls::YES) const + { + return const_optional_iterator{ + count_it{0}, detail::optional_accessor{*this}}; + } + + /** + * @brief Return an optional iterator to the first element of the column. + * + * Dereferencing the returned iterator returns a `thrust::optional`. + * + * When the element of an iterator contextually converted to bool, the conversion returns true + * if the object contains a value and false if it does not contain a value. + * + * optional_begin with mode `NO` means that the column has no null values, + * therefore the optional will always contain a value. 
+ * + * Example: + * + * \code{.cpp} + * template + * void some_function(cudf::column_view const& col_view){ + * auto d_col = cudf::column_device_view::create(col_view); + * if constexpr(has_nulls) { + * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::YES{}); + * //use optional_iterator + * } else { + * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::NO{}); + * //use optional_iterator + * } + * } + * \endcode + * + * This function does not participate in overload resolution if + * `column_device_view::has_element_accessor()` is false. + * + * @throws cudf::logic_error if column datatype and Element type mismatch. + */ + template ())> + auto optional_begin(contains_nulls::NO) const + { + return const_optional_iterator{ + count_it{0}, detail::optional_accessor{*this}}; + } + /** * @brief Return a pair iterator to the first element of the column. * @@ -558,6 +707,63 @@ class alignas(16) column_device_view : public detail::column_device_view_base { detail::pair_rep_accessor{*this}}; } + /** + * @brief Return an optional iterator to the element following the last element of + * the column. + * + * Dereferencing the returned iterator returns a `thrust::optional`. + * + * This function does not participate in overload resolution if + * `column_device_view::has_element_accessor()` is false. + * + * @throws cudf::logic_error if the column is not nullable, and `DYNAMIC` mode used and + * the user has stated nulls exist + * @throws cudf::logic_error if column datatype and Element type mismatch. + */ + template ())> + auto optional_end(contains_nulls::DYNAMIC, bool has_nulls) const + { + return const_optional_iterator{ + count_it{size()}, detail::optional_accessor{*this, has_nulls}}; + } + + /** + * @brief Return an optional iterator to the element following the last element of + * the column. + * + * Dereferencing the returned iterator returns a `thrust::optional`. 
+ * + * This function does not participate in overload resolution if + * `column_device_view::has_element_accessor()` is false. + * + * @throws cudf::logic_error if the column is not nullable, and `YES` mode used + * @throws cudf::logic_error if column datatype and Element type mismatch. + */ + template ())> + auto optional_end(contains_nulls::YES) const + { + return const_optional_iterator{ + count_it{size()}, detail::optional_accessor{*this}}; + } + + /** + * @brief Return an optional iterator to the element following the last element of + * the column. + * + * Dereferencing the returned iterator returns a `thrust::optional`. + * + * This function does not participate in overload resolution if + * `column_device_view::has_element_accessor()` is false. + * + * @throws cudf::logic_error if column datatype and Element type mismatch. + */ + template ())> + auto optional_end(contains_nulls::NO) const + { + return const_optional_iterator{ + count_it{size()}, detail::optional_accessor{*this}}; + } + /** * @brief Return a pair iterator to the element following the last element of * the column. @@ -999,6 +1205,82 @@ struct value_accessor { __device__ T operator()(cudf::size_type i) const { return col.element(i); } }; +/** + * @brief optional accessor of a column + * + * + * The optional_accessor always returns a thrust::optional of column[i]. The validity + * of the optional is determined by the contains_nulls_mode template parameter + * which has the following modes: + * + * - `YES` means that the column supports nulls and has null values, therefore + * the optional might be valid or invalid + * + * - `NO` the user has attested that the column has no null values, + * no checks will occur and `thrust::optional{column[i]}` will be + * return for each `i`. + * + * - `DYNAMIC` defers the assumption of nullability to runtime with the users stating + * on construction of the iterator if column has nulls. 
+ * When `with_nulls=true` the return value validity will be determined if column[i] + * is not null. + * When `with_nulls=false` the return value will always be valid + * + * @throws cudf::logic_error if column datatype and template T type mismatch. + * @throws cudf::logic_error if the column is not nullable, and `with_nulls=true` + * + * + * @tparam T The type of elements in the column + * @tparam contains_nulls_mode Specifies if nulls are checked at runtime or compile time. + */ +template +struct optional_accessor { + column_device_view const col; ///< column view of column in device + + /** + * @brief constructor + * @param[in] _col column device view of cudf column + */ + optional_accessor(column_device_view const& _col) : col{_col} + { + CUDF_EXPECTS(type_id_matches_device_storage_type(col.type().id()), "the data type mismatch"); + } + + CUDA_DEVICE_CALLABLE + thrust::optional operator()(cudf::size_type i) const + { + if constexpr (std::is_same_v) { + return (col.is_valid_nocheck(i)) ? thrust::optional{col.element(i)} + : thrust::optional{thrust::nullopt}; + } + return thrust::optional{col.element(i)}; + } +}; + +template +struct optional_accessor { + column_device_view const col; ///< column view of column in device + bool has_nulls; + + /** + * @brief constructor + * @param[in] _col column device view of cudf column + */ + optional_accessor(column_device_view const& _col, bool with_nulls) + : col{_col}, has_nulls{with_nulls} + { + CUDF_EXPECTS(type_id_matches_device_storage_type(col.type().id()), "the data type mismatch"); + if (with_nulls) { CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column."); } + } + + CUDA_DEVICE_CALLABLE + thrust::optional operator()(cudf::size_type i) const + { + return (has_nulls and col.is_null_nocheck(i)) ? 
thrust::optional{thrust::nullopt} + : thrust::optional{col.element(i)}; + } +}; + /** * @brief pair accessor of column with/without null bitmask * A unary functor returns pair with scalar value at `id` and boolean validity diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 43c2407d629..e5424f0fc44 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -541,7 +541,8 @@ std::unique_ptr make_structs_column( * * The output column will have the same type as `s.type()` * The output column will contain all null rows if `s.invalid()==false` - * The output column will be empty if `size==0`. + * The output column will be empty if `size==0`. For LIST scalars, the column hierarchy + * from @p s is preserved. * * @param[in] s The scalar to use for values in the column. * @param[in] size The number of rows for the output column. diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index bb44e33f786..c9a4eab2154 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -182,6 +182,14 @@ enum class mask_allocation_policy { */ std::unique_ptr empty_like(column_view const& input); +/** + * @brief Initializes and returns an empty column of the same type as the `input`. + * + * @param[in] input Scalar to emulate + * @return std::unique_ptr An empty column of same type as `input` + */ +std::unique_ptr empty_like(scalar const& input); + /** * @brief Creates an uninitialized new column of the same size and type as the `input`. * Supports only fixed-width types. 
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh index 3f5f5a91632..09763d66403 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.cuh +++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh @@ -53,6 +53,14 @@ struct corresponding_operator { using type = DeviceMax; }; template <> +struct corresponding_operator { + using type = DeviceMin; +}; +template <> +struct corresponding_operator { + using type = DeviceMax; +}; +template <> struct corresponding_operator { using type = DeviceMax; }; @@ -81,6 +89,10 @@ struct corresponding_operator { using type = DeviceSum; }; template <> +struct corresponding_operator { + using type = DeviceSum; +}; +template <> struct corresponding_operator { using type = DeviceCount; }; @@ -92,6 +104,12 @@ struct corresponding_operator { template using corresponding_operator_t = typename corresponding_operator::type; +template +constexpr bool has_corresponding_operator() +{ + return !std::is_same::type, void>::value; +} + template ; @@ -152,8 +168,6 @@ struct update_target_element(source.element(source_index))); if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - -#endif } }; @@ -190,8 +204,6 @@ struct update_target_element; @@ -202,8 +214,6 @@ struct update_target_element(source.element(source_index))); if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } - -#endif } }; @@ -240,8 +250,6 @@ struct update_target_element; @@ -252,7 +260,6 @@ struct update_target_element(source.element(source_index))); if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } -#endif } }; @@ -260,42 +267,55 @@ struct update_target_element struct update_target_from_dictionary { - template () && !is_fixed_point()>* = nullptr> - __device__ void operator()(mutable_column_device_view& target, + template ()>* = nullptr> + __device__ void operator()(mutable_column_device_view 
target, size_type target_index, - column_device_view& d_dictionary, + column_device_view source, size_type source_index) const noexcept { -// This code will segfault in nvcc/ptxas 10.2 only -// https://nvbugswb.nvidia.com/NvBugs5/SWBug.aspx?bugid=3186317 -#if (__CUDACC_VER_MAJOR__ != 10) or (__CUDACC_VER_MINOR__ != 2) - auto const keys = d_dictionary.child(cudf::dictionary_column_view::keys_column_index); - auto const value = keys.element( - static_cast(d_dictionary.element(source_index))); - using Target = target_type_t; - atomicAdd(&target.element(target_index), static_cast(value)); -#endif + update_target_element{}( + target, target_index, source, source_index); } - template () || is_fixed_point()>* = nullptr> - __device__ void operator()(mutable_column_device_view& target, + template ()>* = nullptr> + __device__ void operator()(mutable_column_device_view target, size_type target_index, - column_device_view& d_dictionary, - size_type source_index) const noexcept {}; + column_device_view source, + size_type source_index) const noexcept + { + } }; /** - * @brief Specialization function for dictionary type and aggregation SUM. + * @brief Specialization function for dictionary type and aggregations. + * + * The `source` column is a dictionary type. This functor de-references the + * dictionary's keys child column and maps the input source index through + * the dictionary's indices child column to pass to the `update_target_element` + * in the above `update_target_from_dictionary` using the type-dispatcher to + * resolve the keys column type. + * + * `update_target_element( target, target_index, source.keys(), source.indices()[source_index] )` * * @tparam target_has_nulls Indicates presence of null elements in `target` * @tparam source_has_nulls Indicates presence of null elements in `source`. 
*/ -template -struct update_target_element { +template +struct update_target_element< + dictionary32, + k, + target_has_nulls, + source_has_nulls, + std::enable_if_t> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -303,40 +323,29 @@ struct update_target_element{}, + target, + target_index, + source.child(cudf::dictionary_column_view::keys_column_index), + static_cast(source.element(source_index))); } }; -// This code will segfault in nvcc/ptxas 10.2 only -// https://nvbugswb.nvidia.com/NvBugs5/SWBug.aspx?bugid=3186317 -// Enabling only for 2 types does not segfault. Using for unit tests. -#if (__CUDACC_VER_MAJOR__ == 10) and (__CUDACC_VER_MINOR__ == 2) -template -constexpr bool is_SOS_supported() -{ - return std::is_floating_point::value; -} -#else template -constexpr bool is_SOS_supported() +constexpr bool is_product_supported() { return is_numeric(); } -#endif template struct update_target_element()>> { + std::enable_if_t()>> { __device__ void operator()(mutable_column_device_view target, size_type target_index, column_device_view source, @@ -351,6 +360,26 @@ struct update_target_element +struct update_target_element()>> { + __device__ void operator()(mutable_column_device_view target, + size_type target_index, + column_device_view source, + size_type source_index) const noexcept + { + if (source_has_nulls and source.is_null(source_index)) { return; } + + using Target = target_type_t; + atomicMul(&target.element(target_index), + static_cast(source.element(source_index))); + if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + template struct update_target_element< Source, @@ -559,7 +588,8 @@ struct identity_initializer { k == aggregation::COUNT_VALID or k == aggregation::COUNT_ALL or k == aggregation::ARGMAX or k == aggregation::ARGMIN or k == aggregation::SUM_OF_SQUARES or k == aggregation::STD or - k == aggregation::VARIANCE); + k == 
aggregation::VARIANCE or + (k == aggregation::PRODUCT and is_product_supported())); } template @@ -577,27 +607,17 @@ struct identity_initializer { } template - typename std::enable_if::value, T>::type get_identity() + T get_identity() { - if (k == aggregation::ARGMAX) - return T{typename T::duration(ARGMAX_SENTINEL)}; - else if (k == aggregation::ARGMIN) - return T{typename T::duration(ARGMIN_SENTINEL)}; - else - // In C++17, we can use compile time if and not make this function SFINAE - return identity_from_operator(); - } - - template - typename std::enable_if::value, T>::type get_identity() - { - if (k == aggregation::ARGMAX) - return static_cast(ARGMAX_SENTINEL); - else if (k == aggregation::ARGMIN) - return static_cast(ARGMIN_SENTINEL); - else - // In C++17, we can use compile time if and not make this function SFINAE - return identity_from_operator(); + if (k == aggregation::ARGMAX || k == aggregation::ARGMIN) { + if constexpr (cudf::is_timestamp()) + return k == aggregation::ARGMAX ? T{typename T::duration(ARGMAX_SENTINEL)} + : T{typename T::duration(ARGMIN_SENTINEL)}; + else + return k == aggregation::ARGMAX ? static_cast(ARGMAX_SENTINEL) + : static_cast(ARGMIN_SENTINEL); + } + return identity_from_operator(); } public: diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 0bfe6b84ae2..e230ce0b757 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -28,252 +28,495 @@ namespace cudf { namespace detail { -// Forward declare compound aggregations. 
-class mean_aggregation; -class var_aggregation; -class std_aggregation; -class min_aggregation; -class max_aggregation; - // Visitor pattern +class simple_aggregations_collector { // Declares the interface for the simple aggregations + // collector + public: + // Declare overloads for each kind of a agg to dispatch + virtual std::vector> visit(data_type col_type, + aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class sum_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class product_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class min_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class max_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class count_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class any_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class all_aggregation const& agg); + virtual std::vector> visit( + data_type col_type, class sum_of_squares_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class mean_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class var_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class std_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class median_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class quantile_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class argmax_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class argmin_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class nunique_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class nth_element_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class row_number_aggregation const& agg); + 
virtual std::vector> visit( + data_type col_type, class collect_list_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class collect_set_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class lead_lag_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class udf_aggregation const& agg); +}; + class aggregation_finalizer { // Declares the interface for the finalizer public: // Declare overloads for each kind of a agg to dispatch - virtual void visit(aggregation const& agg) = 0; - virtual void visit(min_aggregation const& agg) = 0; - virtual void visit(max_aggregation const& agg) = 0; - virtual void visit(mean_aggregation const& agg) = 0; - virtual void visit(var_aggregation const& agg) = 0; - virtual void visit(std_aggregation const& agg) = 0; + virtual void visit(aggregation const& agg); + virtual void visit(class sum_aggregation const& agg); + virtual void visit(class product_aggregation const& agg); + virtual void visit(class min_aggregation const& agg); + virtual void visit(class max_aggregation const& agg); + virtual void visit(class count_aggregation const& agg); + virtual void visit(class any_aggregation const& agg); + virtual void visit(class all_aggregation const& agg); + virtual void visit(class sum_of_squares_aggregation const& agg); + virtual void visit(class mean_aggregation const& agg); + virtual void visit(class var_aggregation const& agg); + virtual void visit(class std_aggregation const& agg); + virtual void visit(class median_aggregation const& agg); + virtual void visit(class quantile_aggregation const& agg); + virtual void visit(class argmax_aggregation const& agg); + virtual void visit(class argmin_aggregation const& agg); + virtual void visit(class nunique_aggregation const& agg); + virtual void visit(class nth_element_aggregation const& agg); + virtual void visit(class row_number_aggregation const& agg); + virtual void visit(class collect_list_aggregation const& agg); 
+ virtual void visit(class collect_set_aggregation const& agg); + virtual void visit(class lead_lag_aggregation const& agg); + virtual void visit(class udf_aggregation const& agg); }; /** - * @brief Derived class for specifying a min aggregation + * @brief Derived class for specifying a sum aggregation */ -struct min_aggregation final : aggregation { - min_aggregation() : aggregation{MIN} {} +class sum_aggregation final : public rolling_aggregation { + public: + sum_aggregation() : aggregation(SUM) {} - std::vector get_simple_aggregations(data_type col_type) const override + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override { - if (col_type.id() == type_id::STRING) - return {aggregation::ARGMIN}; - else - return {this->kind}; + return collector.visit(col_type, *this); } - void finalize(aggregation_finalizer& finalizer) override { finalizer.visit(*this); } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + +/** + * @brief Derived class for specifying a product aggregation + */ +class product_aggregation final : public aggregation { + public: + product_aggregation() : aggregation(PRODUCT) {} std::unique_ptr clone() const override { - return std::make_unique(*this); + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; /** - * @brief Derived class for specifying a max aggregation + * @brief Derived class for specifying a min aggregation */ -struct max_aggregation final : aggregation { - max_aggregation() : aggregation{MAX} {} +class min_aggregation final : public rolling_aggregation { + public: + 
min_aggregation() : aggregation(MIN) {} - std::vector get_simple_aggregations(data_type col_type) const override + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override { - if (col_type.id() == type_id::STRING) - return {aggregation::ARGMAX}; - else - return {this->kind}; + return collector.visit(col_type, *this); } - void finalize(aggregation_finalizer& finalizer) override { finalizer.visit(*this); } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + +/** + * @brief Derived class for specifying a max aggregation + */ +class max_aggregation final : public rolling_aggregation { + public: + max_aggregation() : aggregation(MAX) {} std::unique_ptr clone() const override { return std::make_unique(*this); } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; /** - * @brief A wrapper to simplify inheritance of virtual methods from aggregation - * - * Derived aggregations are required to implement operator==() and hash_impl(). 
- * - * https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern + * @brief Derived class for specifying a count aggregation */ -template -class derived_aggregation : public aggregation { +class count_aggregation final : public rolling_aggregation { public: - derived_aggregation(aggregation::Kind a) : aggregation(a) {} + count_aggregation(aggregation::Kind kind) : aggregation(kind) {} - bool is_equal(aggregation const& other) const override + std::unique_ptr clone() const override { - if (this->aggregation::is_equal(other)) { - // Dispatch to operator== using static polymorphism - return static_cast(*this) == static_cast(other); - } else { - return false; - } + return std::make_unique(*this); } - - size_t do_hash() const override + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override { - // Dispatch to hash_impl() using static polymorphism - return this->aggregation::do_hash() ^ static_cast(*this).hash_impl(); + return collector.visit(col_type, *this); } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + +/** + * @brief Derived class for specifying an any aggregation + */ +class any_aggregation final : public aggregation { + public: + any_aggregation() : aggregation(ANY) {} std::unique_ptr clone() const override { - // Dispatch to copy constructor using static polymorphism - return std::make_unique(static_cast(*this)); + return std::make_unique(*this); } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; /** - * @brief Derived class for specifying a quantile aggregation + * @brief Derived class for specifying an all aggregation */ -struct quantile_aggregation final : derived_aggregation { - 
quantile_aggregation(std::vector const& q, interpolation i) - : derived_aggregation{QUANTILE}, _quantiles{q}, _interpolation{i} +class all_aggregation final : public aggregation { + public: + all_aggregation() : aggregation(ALL) {} + + std::unique_ptr clone() const override { + return std::make_unique(*this); } - std::vector _quantiles; ///< Desired quantile(s) - interpolation _interpolation; ///< Desired interpolation + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; - protected: - friend class derived_aggregation; +/** + * @brief Derived class for specifying a sum_of_squares aggregation + */ +class sum_of_squares_aggregation final : public aggregation { + public: + sum_of_squares_aggregation() : aggregation(SUM_OF_SQUARES) {} - bool operator==(quantile_aggregation const& other) const + std::unique_ptr clone() const override { - return _interpolation == other._interpolation and - std::equal(_quantiles.begin(), _quantiles.end(), other._quantiles.begin()); + return std::make_unique(*this); } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; - size_t hash_impl() const +/** + * @brief Derived class for specifying a mean aggregation + */ +class mean_aggregation final : public rolling_aggregation { + public: + mean_aggregation() : aggregation(MEAN) {} + + std::unique_ptr clone() const override { - return std::hash{}(static_cast(_interpolation)) ^ - std::accumulate( - _quantiles.cbegin(), _quantiles.cend(), size_t{0}, [](size_t a, double b) { - return a ^ std::hash{}(b); - }); + return std::make_unique(*this); } + 
std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; /** - * @brief Derived aggregation class for specifying LEAD/LAG window aggregations + * @brief Derived class for specifying a standard deviation/variance aggregation */ -struct lead_lag_aggregation final : derived_aggregation { - lead_lag_aggregation(Kind kind, size_type offset) - : derived_aggregation{offset < 0 ? (kind == LAG ? LEAD : LAG) : kind}, - row_offset{std::abs(offset)} +class std_var_aggregation : public aggregation { + public: + size_type _ddof; ///< Delta degrees of freedom + + bool is_equal(aggregation const& _other) const override { + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return _ddof == other._ddof; } - size_type row_offset; + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } protected: - friend class derived_aggregation; - - bool operator==(lead_lag_aggregation const& rhs) const { return row_offset == rhs.row_offset; } + std_var_aggregation(aggregation::Kind k, size_type ddof) : aggregation(k), _ddof{ddof} + { + CUDF_EXPECTS(k == aggregation::STD or k == aggregation::VARIANCE, + "std_var_aggregation can accept only STD, VARIANCE"); + } - size_t hash_impl() const { return std::hash()(row_offset); } + size_type hash_impl() const { return std::hash{}(_ddof); } }; /** - * @brief Derived class for specifying a mean aggregation + * @brief Derived class for specifying a variance aggregation */ -struct mean_aggregation final : aggregation { - mean_aggregation() : aggregation{MEAN} {} +class var_aggregation final : public std_var_aggregation { + public: + var_aggregation(size_type ddof) : std_var_aggregation{aggregation::VARIANCE, ddof} {} - std::vector 
get_simple_aggregations(data_type col_type) const override + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override { - CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); - return {aggregation::SUM, aggregation::COUNT_VALID}; + return collector.visit(col_type, *this); } - void finalize(aggregation_finalizer& finalizer) override { finalizer.visit(*this); } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + +/** + * @brief Derived class for specifying a standard deviation aggregation + */ +class std_aggregation final : public std_var_aggregation { + public: + std_aggregation(size_type ddof) : std_var_aggregation{aggregation::STD, ddof} {} std::unique_ptr clone() const override { - return std::make_unique(*this); + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; /** - * @brief Derived class for specifying a standard deviation/variance aggregation + * @brief Derived class for specifying a median aggregation */ -struct std_var_aggregation : derived_aggregation { - size_type _ddof; ///< Delta degrees of freedom +class median_aggregation final : public aggregation { + public: + median_aggregation() : aggregation(MEDIAN) {} - virtual std::vector get_simple_aggregations(data_type col_type) const override + std::unique_ptr clone() const override { - return {aggregation::SUM, aggregation::COUNT_VALID}; + return std::make_unique(*this); } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + 
return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; - protected: - friend class derived_aggregation; +/** + * @brief Derived class for specifying a quantile aggregation + */ +class quantile_aggregation final : public aggregation { + public: + quantile_aggregation(std::vector const& q, interpolation i) + : aggregation{QUANTILE}, _quantiles{q}, _interpolation{i} + { + } + std::vector _quantiles; ///< Desired quantile(s) + interpolation _interpolation; ///< Desired interpolation - bool operator==(std_var_aggregation const& other) const { return _ddof == other._ddof; } + bool is_equal(aggregation const& _other) const override + { + if (!this->aggregation::is_equal(_other)) { return false; } - size_t hash_impl() const { return std::hash{}(_ddof); } + auto const& other = dynamic_cast(_other); - std_var_aggregation(aggregation::Kind k, size_type ddof) : derived_aggregation{k}, _ddof{ddof} + return _interpolation == other._interpolation && + std::equal(_quantiles.begin(), _quantiles.end(), other._quantiles.begin()); + } + + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + + std::unique_ptr clone() const override { - CUDF_EXPECTS(k == aggregation::STD or k == aggregation::VARIANCE, - "std_var_aggregation can accept only STD, VARIANCE"); + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + + private: + size_t hash_impl() const + { + return std::hash{}(static_cast(_interpolation)) ^ + std::accumulate( + _quantiles.cbegin(), _quantiles.cend(), size_t{0}, [](size_t a, double b) { + return a ^ std::hash{}(b); + }); } }; /** - * @brief Derived class for specifying a standard deviation 
aggregation + * @brief Derived class for specifying an argmax aggregation */ -struct std_aggregation final : std_var_aggregation { - std_aggregation(size_type ddof) : std_var_aggregation{aggregation::STD, ddof} {} - void finalize(aggregation_finalizer& finalizer) override { finalizer.visit(*this); } +class argmax_aggregation final : public rolling_aggregation { + public: + argmax_aggregation() : aggregation(ARGMAX) {} + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; /** - * @brief Derived class for specifying a variance aggregation + * @brief Derived class for specifying an argmin aggregation */ -struct var_aggregation final : std_var_aggregation { - var_aggregation(size_type ddof) : std_var_aggregation{aggregation::VARIANCE, ddof} {} - void finalize(aggregation_finalizer& finalizer) override { finalizer.visit(*this); } +class argmin_aggregation final : public rolling_aggregation { + public: + argmin_aggregation() : aggregation(ARGMIN) {} + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; /** * @brief Derived class for specifying a nunique aggregation */ -struct nunique_aggregation final : derived_aggregation { +class nunique_aggregation final : public aggregation { + public: nunique_aggregation(null_policy null_handling) - : derived_aggregation{NUNIQUE}, _null_handling{null_handling} + : aggregation{NUNIQUE}, _null_handling{null_handling} { } - 
null_policy _null_handling; ///< include or exclude nulls - protected: - friend class derived_aggregation; + null_policy _null_handling; ///< include or exclude nulls - bool operator==(nunique_aggregation const& other) const + bool is_equal(aggregation const& _other) const override { + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); return _null_handling == other._null_handling; } + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + + private: size_t hash_impl() const { return std::hash{}(static_cast(_null_handling)); } }; /** * @brief Derived class for specifying a nth element aggregation */ -struct nth_element_aggregation final : derived_aggregation { +class nth_element_aggregation final : public aggregation { + public: nth_element_aggregation(size_type n, null_policy null_handling) - : derived_aggregation{NTH_ELEMENT}, _n{n}, _null_handling{null_handling} + : aggregation{NTH_ELEMENT}, _n{n}, _null_handling{null_handling} { } + size_type _n; ///< nth index to return null_policy _null_handling; ///< include or exclude nulls - protected: - friend class derived_aggregation; - - bool operator==(nth_element_aggregation const& other) const + bool is_equal(aggregation const& _other) const override { + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); return _n == other._n and _null_handling == other._null_handling; } + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + + std::unique_ptr clone() const override + { + return 
std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + + private: size_t hash_impl() const { return std::hash{}(_n) ^ std::hash{}(static_cast(_null_handling)); @@ -281,92 +524,102 @@ struct nth_element_aggregation final : derived_aggregation { - udf_aggregation(aggregation::Kind type, - std::string const& user_defined_aggregator, - data_type output_type) - : derived_aggregation{type}, - _source{user_defined_aggregator}, - _operator_name{(type == aggregation::PTX) ? "rolling_udf_ptx" : "rolling_udf_cuda"}, - _function_name{"rolling_udf"}, - _output_type{output_type} - { - CUDF_EXPECTS(type == aggregation::PTX or type == aggregation::CUDA, - "udf_aggregation can accept only PTX, CUDA"); - } - std::string const _source; - std::string const _operator_name; - std::string const _function_name; - data_type _output_type; - - protected: - friend class derived_aggregation; +class row_number_aggregation final : public rolling_aggregation { + public: + row_number_aggregation() : aggregation(ROW_NUMBER) {} - bool operator==(udf_aggregation const& other) const + std::unique_ptr clone() const override { - return _source == other._source and _operator_name == other._operator_name and - _function_name == other._function_name and _output_type == other._output_type; + return std::make_unique(*this); } - - size_t hash_impl() const + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override { - return std::hash{}(_source) ^ std::hash{}(_operator_name) ^ - std::hash{}(_function_name) ^ - std::hash{}(static_cast(_output_type.id())); + return collector.visit(col_type, *this); } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } 
}; /** * @brief Derived aggregation class for specifying COLLECT_LIST aggregation */ -struct collect_list_aggregation final : derived_aggregation { +class collect_list_aggregation final : public rolling_aggregation { + public: explicit collect_list_aggregation(null_policy null_handling = null_policy::INCLUDE) - : derived_aggregation{COLLECT_LIST}, _null_handling{null_handling} + : aggregation{COLLECT_LIST}, _null_handling{null_handling} { } + null_policy _null_handling; ///< include or exclude nulls - protected: - friend class derived_aggregation; + bool is_equal(aggregation const& _other) const override + { + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return (_null_handling == other._null_handling); + } - bool operator==(nunique_aggregation const& other) const + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + + std::unique_ptr clone() const override { - return _null_handling == other._null_handling; + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + private: size_t hash_impl() const { return std::hash{}(static_cast(_null_handling)); } }; /** * @brief Derived aggregation class for specifying COLLECT_SET aggregation */ -struct collect_set_aggregation final : derived_aggregation { +class collect_set_aggregation final : public rolling_aggregation { + public: explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::UNEQUAL) - : derived_aggregation{COLLECT_SET}, + : aggregation{COLLECT_SET}, _null_handling{null_handling}, _nulls_equal(nulls_equal), _nans_equal(nans_equal) { } + null_policy 
_null_handling; ///< include or exclude nulls null_equality _nulls_equal; ///< whether to consider nulls as equal values nan_equality _nans_equal; ///< whether to consider NaNs as equal value (applicable only to ///< floating point types) - protected: - friend class derived_aggregation; + bool is_equal(aggregation const& _other) const override + { + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return (_null_handling == other._null_handling && _nulls_equal == other._nulls_equal && + _nans_equal == other._nans_equal); + } + + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } - bool operator==(collect_set_aggregation const& other) const + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override { - return _null_handling == other._null_handling && _nulls_equal == other._nulls_equal && - _nans_equal == other._nans_equal; + return collector.visit(col_type, *this); } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + protected: size_t hash_impl() const { return std::hash{}(static_cast(_null_handling) ^ static_cast(_nulls_equal) ^ @@ -374,6 +627,96 @@ struct collect_set_aggregation final : derived_aggregationaggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return (row_offset == other.row_offset); + } + + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { 
finalizer.visit(*this); } + + size_type row_offset; + + private: + size_t hash_impl() const { return std::hash()(row_offset); } +}; + +/** + * @brief Derived class for specifying a custom aggregation + * specified in udf + */ +class udf_aggregation final : public rolling_aggregation { + public: + udf_aggregation(aggregation::Kind type, + std::string const& user_defined_aggregator, + data_type output_type) + : aggregation{type}, + _source{user_defined_aggregator}, + _operator_name{(type == aggregation::PTX) ? "rolling_udf_ptx" : "rolling_udf_cuda"}, + _function_name{"rolling_udf"}, + _output_type{output_type} + { + CUDF_EXPECTS(type == aggregation::PTX or type == aggregation::CUDA, + "udf_aggregation can accept only PTX, CUDA"); + } + + bool is_equal(aggregation const& _other) const override + { + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return (_source == other._source and _operator_name == other._operator_name and + _function_name == other._function_name and _output_type == other._output_type); + } + + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + + std::string const _source; + std::string const _operator_name; + std::string const _function_name; + data_type _output_type; + + protected: + size_t hash_impl() const + { + return std::hash{}(_source) ^ std::hash{}(_operator_name) ^ + std::hash{}(_function_name) ^ + std::hash{}(static_cast(_output_type.id())); + } +}; + /** * @brief Sentinel value used for `ARGMAX` aggregation. 
* @@ -441,9 +784,10 @@ struct target_type_impl { // Except for chrono types where result is chrono. (Use FloorDiv) // TODO: MEAN should be only be enabled for duration types - not for timestamps template -struct target_type_impl() && (k == aggregation::MEAN)>> { +struct target_type_impl< + Source, + k, + std::enable_if_t() && !is_chrono() && (k == aggregation::MEAN)>> { using type = double; }; @@ -689,7 +1033,7 @@ template struct dispatch_aggregation { #pragma nv_exec_check_disable template - CUDA_HOST_DEVICE_CALLABLE decltype(auto) operator()(F&& f, Ts&&... args) const noexcept + CUDA_HOST_DEVICE_CALLABLE decltype(auto) operator()(F&& f, Ts&&... args) const { return f.template operator()(std::forward(args)...); } @@ -700,7 +1044,7 @@ struct dispatch_source { template CUDA_HOST_DEVICE_CALLABLE decltype(auto) operator()(aggregation::Kind k, F&& f, - Ts&&... args) const noexcept + Ts&&... args) const { return aggregation_dispatcher( k, dispatch_aggregation{}, std::forward(f), std::forward(args)...); @@ -763,4 +1107,4 @@ constexpr inline bool is_valid_aggregation() bool is_valid_aggregation(data_type source, aggregation::Kind k); } // namespace detail -} // namespace cudf +} // namespace cudf \ No newline at end of file diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index 2783bd7729f..aebf0c23469 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -88,6 +88,47 @@ std::unique_ptr shift( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Performs segmented shifts for specified values. + * + * For each segment, `i`th element is determined by the `i - offset`th element + * of the segment. If `i - offset < 0 or >= segment_size`, the value is determined by + * @p fill_value. 
+ * + * Example: + * @code{.pseudo} + * segmented_values: { 3 1 2 | 3 5 3 | 2 6 } + * segment_offsets: {0 3 6 8} + * offset: 2 + * fill_value: @ + * result: { @ @ 3 | @ @ 3 | @ @ } + * ------------------------------------------------- + * segmented_values: { 3 1 2 | 3 5 3 | 2 6 } + * segment_offsets: {0 3 6 8} + * offset: -1 + * fill_value: -1 + * result: { 1 2 -1 | 5 3 -1 | 6 -1 } + * @endcode + * + * @param segmented_values Segmented column, specified by @p segment_offsets + * @param segment_offsets Each segment's offset of @p segmented_values. A list of offsets + * with size `num_segments + 1`. The size of each segment is `segment_offsets[i+1] - + * segment_offsets[i]`. + * @param offset The offset by which to shift the input + * @param fill_value Fill value for indeterminable outputs + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @note If `offset == 0`, a copy of @p segmented_values is returned. 
+ */ +std::unique_ptr segmented_shift( + column_view const& segmented_values, + device_span segment_offsets, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::contiguous_split * diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index fbf68a20364..2051daec00b 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -278,9 +278,9 @@ struct scatter_gather_functor { std::unique_ptr operator()( cudf::column_view const& input, cudf::size_type const& output_size, - cudf::size_type const* block_offsets, + cudf::size_type const*, Filter filter, - cudf::size_type per_thread, + cudf::size_type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 7a560e4c048..1dd0d472d0d 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -35,7 +36,6 @@ #include #include -#include #include #include @@ -567,15 +567,14 @@ void gather_bitmask(table_view const& source, } // Make device array of target bitmask pointers - thrust::host_vector target_masks(target.size()); + std::vector target_masks(target.size()); std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) { return col->mutable_view().null_mask(); }); - rmm::device_vector d_target_masks(target_masks); + auto d_target_masks = make_device_uvector_async(target_masks, stream); - auto const masks = d_target_masks.data().get(); auto const device_source = table_device_view::create(source, stream); - auto d_valid_counts = rmm::device_vector(target.size()); + auto d_valid_counts = 
make_zeroed_device_uvector_async(target.size(), stream); // Dispatch operation enum to get implementation auto const impl = [op]() { @@ -591,14 +590,14 @@ void gather_bitmask(table_view const& source, }(); impl(*device_source, gather_map, - masks, + d_target_masks.data(), target.size(), target_rows, - d_valid_counts.data().get(), + d_valid_counts.data(), stream); // Copy the valid counts into each column - auto const valid_counts = thrust::host_vector(d_valid_counts); + auto const valid_counts = make_std_vector_sync(d_valid_counts, stream); for (size_t i = 0; i < target.size(); ++i) { if (target[i]->nullable()) { auto const null_count = target_rows - valid_counts[i]; diff --git a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp new file mode 100644 index 00000000000..5fb7379734f --- /dev/null +++ b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include +namespace cudf { +namespace groupby { +namespace detail { + +/** + * @brief Internal API to replace nulls with preceding/following non-null values in @p value + * + * @param[in] grouped_value A column whose null values will be replaced. + * @param[in] group_labels Group labels for @p grouped_value, corresponding to group keys. 
+ * @param[in] replace_policy Specify the position of replacement values relative to null values. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate device memory of the returned column. + */ +std::unique_ptr group_replace_nulls( + cudf::column_view const& grouped_value, + device_span group_labels, + cudf::replace_policy replace_policy, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index 06f523c2320..83d6be14709 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,17 +29,17 @@ namespace detail { */ std::unique_ptr hash( table_view const& input, - hash_id hash_function = hash_id::HASH_MURMUR3, - std::vector const& initial_hash = {}, - uint32_t seed = 0, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + hash_id hash_function = hash_id::HASH_MURMUR3, + cudf::host_span initial_hash = {}, + uint32_t seed = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr murmur_hash3_32( table_view const& input, - std::vector const& initial_hash = {}, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + cudf::host_span initial_hash = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr md5_hash( table_view const& input, diff --git a/cpp/include/cudf/detail/is_element_valid.hpp b/cpp/include/cudf/detail/is_element_valid.hpp new file mode 100644 index 00000000000..fff67f107d9 --- /dev/null +++ b/cpp/include/cudf/detail/is_element_valid.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace detail { + +/** + * @brief Return validity of a row + * + * Retrieves the validity (NULL or non-NULL) of the specified row from device memory. + * + * @note Synchronizes `stream`. + * + * @throw cudf::logic_error if `element_index < 0 or >= col_view.size()` + * + * @param col_view The column to retrieve the validity from. + * @param element_index The index of the row to retrieve. + * @param stream The stream to use for copying the validity to the host. + * @return Host boolean that indicates the validity of the row. + */ + +bool is_element_valid_sync(column_view const& col_view, + size_type element_index, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 881afa63ca5..4cb0c6e1877 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -167,6 +167,134 @@ auto make_null_replacement_iterator(column_device_view const& column, 0, null_replaced_value_accessor{column, null_replacement, has_nulls}); } +/** + * @brief Constructs an optional iterator over a column's values and its validity. + * + * Dereferencing the returned iterator returns a `thrust::optional`. + * + * When the element of an iterator contextually converted to bool, the conversion returns true + * if the object contains a value and false if it does not contain a value. + * + * make_optional_iterator with mode `DYNAMIC` defers the assumption of nullability to + * runtime, with the user stating on construction of the iterator if column has nulls. 
+ * `DYNAMIC` mode is nice when an algorithm is going to execute on mutliple + * iterators and you don't want to compile all the combinations of iterator types + * + * Example: + * + * \code{.cpp} + * template + * void some_function(cudf::column_view const& col_view){ + * auto d_col = cudf::column_device_view::create(col_view); + * // Create a `DYNAMIC` optional iterator + * auto optional_iterator = cudf::detail::make_optional_iterator(d_col, + * cudf::contains_nulls::DYNAMIC{}, + * col_view.has_nulls()); + * } + * \endcode + * + * @throws cudf::logic_error if the column is not nullable, and `DYNAMIC` mode used and + * the user has stated nulls exist + * @throws cudf::logic_error if column datatype and Element type mismatch. + * + * @tparam Element The type of elements in the column + * @param column The column to iterate + * @return Iterator that returns valid column elements and the validity of the + * element in a thrust::optional + */ +template +auto make_optional_iterator(column_device_view const& column, + contains_nulls::DYNAMIC, + bool has_nulls) +{ + return column.optional_begin(contains_nulls::DYNAMIC{}, has_nulls); +} + +/** + * @brief Constructs an optional iterator over a column's values and its validity. + * + * Dereferencing the returned iterator returns a `thrust::optional`. + * + * When the element of an iterator contextually converted to bool, the conversion returns true + * if the object contains a value and false if it does not contain a value. 
+ * + * make_optional_iterator with mode `YES` means that the column supports nulls and + * potentially has null values, therefore the optional might not contain a value + * + * Example: + * + * \code{.cpp} + * template + * void some_function(cudf::column_view const& col_view){ + * auto d_col = cudf::column_device_view::create(col_view); + * if constexpr(has_nulls) { + * auto optional_iterator = cudf::detail::make_optional_iterator(d_col, + * cudf::contains_nulls::YES{}); + * //use optional_iterator + * } else { + * auto optional_iterator = cudf::detail::make_optional_iterator(d_col, + * cudf::contains_nulls::NO{}); + * //use optional_iterator + * } + * } + * \endcode + * + * @throws cudf::logic_error if the column is not nullable, and `YES` mode used + * @throws cudf::logic_error if column datatype and Element type mismatch. + * + * @tparam Element The type of elements in the column + * @param column The column to iterate + * @return Iterator that returns column elements and the validity of the + * element as a thrust::optional + */ +template +auto make_optional_iterator(column_device_view const& column, contains_nulls::YES) +{ + return column.optional_begin(contains_nulls::YES{}); +} + +/** + * @brief Constructs an optional iterator over a column's values and its validity. + * + * Dereferencing the returned iterator returns a `thrust::optional`. + * + * When the element of an iterator contextually converted to bool, the conversion returns true + * if the object contains a value and false if it does not contain a value. + * + * make_optional_iterator with mode `NO` means that the column has no null values, + * therefore the optional will always contain a value. 
+ * + * Example: + * + * \code{.cpp} + * template + * void some_function(cudf::column_view const& col_view){ + * auto d_col = cudf::column_device_view::create(col_view); + * if constexpr(has_nulls) { + * auto optional_iterator = cudf::detail::make_optional_iterator(d_col, + * cudf::contains_nulls::YES{}); + * //use optional_iterator + * } else { + * auto optional_iterator = cudf::detail::make_optional_iterator(d_col, + * cudf::contains_nulls::NO{}); + * //use optional_iterator + * } + * } + * \endcode + * + * @throws cudf::logic_error if column datatype and Element type mismatch. + * + * @tparam Element The type of elements in the column + * @param column The column to iterate + * @return Iterator that returns column elements and the validity of the + * element in a thrust::optional + */ +template +auto make_optional_iterator(column_device_view const& column, contains_nulls::NO) +{ + return column.optional_begin(contains_nulls::NO{}); +} + /** * @brief Constructs a pair iterator over a column's values and its validity. * @@ -320,6 +448,81 @@ auto inline make_scalar_iterator(scalar const& scalar_value) scalar_value_accessor{scalar_value}); } +template +struct scalar_optional_accessor; + +/** + * @brief optional accessor of a maybe-nullable scalar + * + * The scalar_optional_accessor always returns a thrust::optional of the scalar. + * The validity of the optional is determined by the contains_nulls_mode template parameter + * which has the following modes: + * + * `DYNAMIC`: Defer nullability checks to runtime + * + * - When `with_nulls=true` the return value will be a `thrust::optional{scalar}` + * when scalar is valid, and `thrust::optional{}` when the scalar is invalid. + * + * - When `with_nulls=false` the return value will always be `thrust::optional{scalar}` + * + * `NO`: No null values will occur for this scalar, no checks will occur + * and `thrust::optional{scalar}` will always be returned. 
+ * + * `YES`: null values will occur for this scalar, + * and `thrust::optional{scalar}` will always be returned. + * + * @throws `cudf::logic_error` if scalar datatype and Element type mismatch. + * + * @tparam Element The type of return type of functor + */ +template +struct scalar_optional_accessor : public scalar_value_accessor { + using super_t = scalar_value_accessor; + using value_type = thrust::optional; + + scalar_optional_accessor(scalar const& scalar_value) + : scalar_value_accessor(scalar_value) + { + } + + /** + * @brief returns a thrust::optional. + * + * @throw `cudf::logic_error` if this function is called in host. + * + * @return a thrust::optional for the scalar value. + */ + CUDA_HOST_DEVICE_CALLABLE + const value_type operator()(size_type) const + { + if constexpr (std::is_same_v) { + return (super_t::dscalar.is_valid()) ? Element{super_t::dscalar.value()} + : value_type{thrust::nullopt}; + } + return Element{super_t::dscalar.value()}; + } +}; + +template +struct scalar_optional_accessor + : public scalar_value_accessor { + using super_t = scalar_value_accessor; + using value_type = thrust::optional; + bool has_nulls; + + scalar_optional_accessor(scalar const& scalar_value, bool with_nulls) + : scalar_value_accessor(scalar_value), has_nulls{with_nulls} + { + } + + CUDA_HOST_DEVICE_CALLABLE + const value_type operator()(size_type) const + { + return (has_nulls and !super_t::dscalar.is_valid()) ? value_type{thrust::nullopt} + : Element{super_t::dscalar.value()}; + } +}; + /** * @brief pair accessor for scalar. * The unary functor returns a pair of data of Element type and bool validity of the scalar. @@ -415,6 +618,163 @@ struct scalar_representation_pair_accessor : public scalar_value_accessor`. + * + * When the element of an iterator contextually converted to bool, the conversion returns true + * if the object contains a value and false if it does not contain a value. 
+ * + * The iterator behavior is undefined if the scalar is destroyed before iterator dereferencing. + * + * make_optional_iterator with mode `DYNAMIC` defers the assumption of nullability to + * runtime, with the user stating on construction of the iterator if scalar has nulls. + * + * Example: + * + * \code{.cpp} + * template + * void some_function(cudf::column_view const& col_view, + * scalar const& scalar_value, + * bool col_has_nulls){ + * auto d_col = cudf::column_device_view::create(col_view); + * auto column_iterator = cudf::detail::make_optional_iterator(d_col, + cudf::contains_nulls::DYNAMIC{}, col_has_nulls); + * auto scalar_iterator = cudf::detail::make_optional_iterator(scalar_value, + cudf::contains_nulls::DYNAMIC{}, scalar_value.is_valid()); + * //use iterators + * } + * \endcode + * + * @throws cudf::logic_error if the scalar is not nullable, and `DYNAMIC` mode used and + * the user has stated nulls exist + * @throws cudf::logic_error if scalar datatype and Element type mismatch. + * + * @tparam Element The type of elements in the scalar + * @tparam has_nulls If the scalar value will have a null at runtime + * @param scalar_value The scalar to iterate + * @return Iterator that returns scalar elements and validity of the + * element in a thrust::optional + */ +template +auto inline make_optional_iterator(scalar const& scalar_value, + contains_nulls::DYNAMIC, + bool has_nulls) +{ + CUDF_EXPECTS(type_id_matches_device_storage_type(scalar_value.type().id()), + "the data type mismatch"); + return thrust::make_transform_iterator( + thrust::make_constant_iterator(0), + scalar_optional_accessor{scalar_value, has_nulls}); +} + +/** + * @brief Constructs an optional iterator over a scalar's values and its validity. + * + * Dereferencing the returned iterator returns a `thrust::optional`. 
+ *
+ * When the element of an iterator contextually converted to bool, the conversion returns true
+ * if the object contains a value and false if it does not contain a value.
+ *
+ * The iterator behavior is undefined if the scalar is destroyed before iterator dereferencing.
+ *
+ * make_optional_iterator with mode `YES` means that the scalar supports nulls and
+ * potentially has null values, therefore the optional might not contain a value
+ * (it will be empty when the scalar is invalid).
+ *
+ * Example:
+ *
+ * \code{.cpp}
+ * template
+ * void some_function(cudf::column_view const& col_view, scalar const& scalar_value){
+ * auto d_col = cudf::column_device_view::create(col_view);
+ * if constexpr(any_nulls) {
+ * auto column_iterator = cudf::detail::make_optional_iterator(d_col,
+ * cudf::contains_nulls::YES{});
+ * auto scalar_iterator = cudf::detail::make_optional_iterator(scalar_value,
+ * cudf::contains_nulls::YES{});
+ * //use iterators
+ * } else {
+ * auto column_iterator = cudf::detail::make_optional_iterator(d_col,
+ * cudf::contains_nulls::NO{});
+ * auto scalar_iterator = cudf::detail::make_optional_iterator(scalar_value,
+ * cudf::contains_nulls::NO{});
+ * //use iterators
+ * }
+ * }
+ * \endcode
+ *
+ * @throws cudf::logic_error if the scalar is not nullable, and `YES` mode used
+ * @throws cudf::logic_error if scalar datatype and Element type mismatch.
+ * + * @tparam Element The type of elements in the scalar + * @param scalar_value The scalar to iterate + * @return Iterator that returns scalar elements and the validity of the + * element in a thrust::optional + */ +template +auto inline make_optional_iterator(scalar const& scalar_value, contains_nulls::YES) +{ + CUDF_EXPECTS(type_id_matches_device_storage_type(scalar_value.type().id()), + "the data type mismatch"); + return thrust::make_transform_iterator( + thrust::make_constant_iterator(0), + scalar_optional_accessor{scalar_value}); +} + +/** + * @brief Constructs an optional iterator over a scalar's values and its validity. + * + * Dereferencing the returned iterator returns a `thrust::optional`. + * + * When the element of an iterator contextually converted to bool, the conversion returns true + * if the object contains a value and false if it does not contain a value. + * + * The iterator behavior is undefined if the scalar is destroyed before iterator dereferencing. + * + * make_optional_iterator with mode `NO` means that the scalar has no null values, + * therefore the optional will always contain a value. + * + * Example: + * + * \code{.cpp} + * template + * void some_function(cudf::column_view const& col_view, scalar const& scalar_value){ + * auto d_col = cudf::column_device_view::create(col_view); + * if constexpr(any_nulls) { + * auto column_iterator = cudf::detail::make_optional_iterator(d_col, + * cudf::contains_nulls::YES{}); + * auto scalar_iterator = cudf::detail::make_optional_iterator(scalar_value, + * cudf::contains_nulls::YES{}); + * //use iterators + * } else { + * auto column_iterator = cudf::detail::make_optional_iterator(d_col, + * cudf::contains_nulls::NO{}); + * auto scalar_iterator = cudf::detail::make_optional_iterator(scalar_value, + * cudf::contains_nulls::NO{}); + * //use iterators + * } + * } + * \endcode + * + * @throws cudf::logic_error if scalar datatype and Element type mismatch. 
+ * + * @tparam Element The type of elements in the scalar + * @param scalar_value The scalar to iterate + * @return Iterator that returns scalar elements and the validity of the + * element in a thrust::optional + */ +template +auto inline make_optional_iterator(scalar const& scalar_value, contains_nulls::NO) +{ + CUDF_EXPECTS(type_id_matches_device_storage_type(scalar_value.type().id()), + "the data type mismatch"); + return thrust::make_transform_iterator( + thrust::make_constant_iterator(0), + scalar_optional_accessor{scalar_value}); +} + /** * @brief Constructs a constant device pair iterator over a scalar's value and its validity. * diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh index 06f9bfc5034..a938a3a053a 100644 --- a/cpp/include/cudf/detail/merge.cuh +++ b/cpp/include/cudf/detail/merge.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,12 +30,12 @@ enum class side : bool { LEFT, RIGHT }; * @brief Tagged index type: `thrust::get<0>` indicates left/right side, * `thrust::get<1>` indicates the row index */ -using index_type = thrust::tuple; +using index_type = thrust::pair; /** * @brief Vector of `index_type` values. 
*/ -using index_vector = rmm::device_vector; +using index_vector = rmm::device_uvector; /** * @brief tagged_element_relational_comparator uses element_relational_comparator to provide @@ -80,11 +80,11 @@ struct tagged_element_relational_comparator { __device__ weak_ordering compare(index_type lhs_tagged_index, index_type rhs_tagged_index) const noexcept { - side l_side = thrust::get<0>(lhs_tagged_index); - side r_side = thrust::get<0>(rhs_tagged_index); + side const l_side = thrust::get<0>(lhs_tagged_index); + side const r_side = thrust::get<0>(rhs_tagged_index); - cudf::size_type l_indx = thrust::get<1>(lhs_tagged_index); - cudf::size_type r_indx = thrust::get<1>(rhs_tagged_index); + cudf::size_type const l_indx = thrust::get<1>(lhs_tagged_index); + cudf::size_type const r_indx = thrust::get<1>(rhs_tagged_index); column_device_view const* ptr_left_dview{l_side == side::LEFT ? &lhs : &rhs}; diff --git a/cpp/include/cudf/detail/replace/nulls.cuh b/cpp/include/cudf/detail/replace/nulls.cuh new file mode 100644 index 00000000000..1500bdfb0b8 --- /dev/null +++ b/cpp/include/cudf/detail/replace/nulls.cuh @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +#include + +namespace cudf { +namespace detail { + +using idx_valid_pair_t = thrust::tuple; + +/** + * @brief Functor used by `replace_nulls(replace_policy)` to determine the index to gather from in + * the result column. + * + * Binary functor passed to `inclusive_scan` or `inclusive_scan_by_key`. Arguments are a tuple of + * index and validity of a row. Returns a tuple of current index and a discarded boolean if current + * row is valid, otherwise a tuple of the nearest non-null row index and a discarded boolean. + */ +struct replace_policy_functor { + __device__ idx_valid_pair_t operator()(idx_valid_pair_t const& lhs, idx_valid_pair_t const& rhs) + { + return thrust::get<1>(rhs) ? thrust::make_tuple(thrust::get<0>(rhs), true) + : thrust::make_tuple(thrust::get<0>(lhs), true); + } +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/rolling.hpp b/cpp/include/cudf/detail/rolling.hpp new file mode 100644 index 00000000000..2b06d11c5a9 --- /dev/null +++ b/cpp/include/cudf/detail/rolling.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cudf { +namespace detail { + +/** + * @copydoc std::unique_ptr rolling_window( + * column_view const& input, + * column_view const& preceding_window, + * column_view const& following_window, + * size_type min_periods, + * rolling_aggregation const& agg, + * rmm::mr::device_memory_resource* mr) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr rolling_window( + column_view const& input, + column_view const& preceding_window, + column_view const& following_window, + size_type min_periods, + rolling_aggregation const& agg, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp new file mode 100644 index 00000000000..5691adecb5e --- /dev/null +++ b/cpp/include/cudf/detail/scan.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cudf { +namespace detail { + +/** + * @brief Computes the exclusive scan of a column. + * + * The null values are skipped for the operation, and if an input element + * at `i` is null, then the output element at `i` will also be null. 
+ * + * The identity value for the column type as per the aggregation type + * is used for the value of the first element in the output column. + * + * @throws cudf::logic_error if column data_type is not an arithmetic type. + * + * @param input The input column view for the scan + * @param agg unique_ptr to aggregation operator applied by the scan + * @param null_handling Exclude null values when computing the result if + * null_policy::EXCLUDE. Include nulls if null_policy::INCLUDE. + * Any operation with a null results in a null. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned scalar's device memory + * @returns Column with scan results + */ +std::unique_ptr scan_exclusive(column_view const& input, + std::unique_ptr const& agg, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Computes the inclusive scan of a column. + * + * The null values are skipped for the operation, and if an input element + * at `i` is null, then the output element at `i` will also be null. + * + * String columns are allowed with aggregation types Min and Max. + * + * @throws cudf::logic_error if column data_type is not an arithmetic type + * or string type but the `agg` is not Min or Max + * + * @param input The input column view for the scan + * @param agg unique_ptr to aggregation operator applied by the scan + * @param null_handling Exclude null values when computing the result if + * null_policy::EXCLUDE. Include nulls if null_policy::INCLUDE. + * Any operation with a null results in a null. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned scalar's device memory + * @returns Column with scan results + */ +std::unique_ptr scan_inclusive(column_view const& input, + std::unique_ptr const& agg, + null_policy null_handling, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index d069ed06cae..410cd213618 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -296,7 +296,7 @@ struct column_scatterer_impl { // We still need to call `gather_bitmask` even when the source's children are not nullable, // as if the target's children have null_masks, those null_masks need to be updated after - // being scattered onto + // being scattered onto. auto const child_nullable = std::any_of(structs_src.child_begin(), structs_src.child_end(), [](auto const& col) { return col.nullable(); }) or @@ -315,9 +315,9 @@ struct column_scatterer_impl { mr); } - // Need to put the result column in a vector to call `gather_bitmask` + // Need to put the result column in a vector to call `gather_bitmask`. std::vector> result; - result.emplace_back(cudf::make_structs_column(source.size(), + result.emplace_back(cudf::make_structs_column(target.size(), std::move(output_struct_members), 0, rmm::device_buffer{0, stream, mr}, @@ -325,7 +325,7 @@ struct column_scatterer_impl { mr)); // Only gather bitmask from the target column for the rows that have not been scattered onto - // The bitmask from the source column will be gathered at the top level `scatter()` call + // The bitmask from the source column will be gathered at the top level `scatter()` call. 
if (target.nullable()) { auto const gather_map = scatter_to_gather_complement(scatter_map_begin, scatter_map_end, target.size(), stream); @@ -402,7 +402,7 @@ std::unique_ptr scatter( CUDF_EXPECTS(std::distance(scatter_map_begin, scatter_map_end) <= source.num_rows(), "scatter map size should be <= to number of rows in source"); - // Transform negative indices to index + target size + // Transform negative indices to index + target size. auto updated_scatter_map_begin = thrust::make_transform_iterator(scatter_map_begin, index_converter{target.num_rows()}); auto updated_scatter_map_end = @@ -425,7 +425,7 @@ std::unique_ptr
scatter( }); // We still need to call `gather_bitmask` even when the source columns are not nullable, - // as if the target has null_mask, that null_mask needs to be updated after scattering + // as if the target has null_mask, that null_mask needs to be updated after scattering. auto const nullable = std::any_of(source.begin(), source.end(), [](auto const& col) { return col.nullable(); }) or std::any_of(target.begin(), target.end(), [](auto const& col) { return col.nullable(); }); @@ -433,6 +433,25 @@ std::unique_ptr
scatter( auto const gather_map = scatter_to_gather( updated_scatter_map_begin, updated_scatter_map_end, target.num_rows(), stream); gather_bitmask(source, gather_map.begin(), result, gather_bitmask_op::PASSTHROUGH, stream, mr); + + // For struct columns, we need to superimpose the null_mask of the parent over the null_mask of + // the children. + std::for_each(result.begin(), result.end(), [=](auto& col) { + auto const col_view = col->view(); + if (col_view.type().id() == type_id::STRUCT and col_view.nullable()) { + auto const num_rows = col_view.size(); + auto const null_count = col_view.null_count(); + auto contents = col->release(); + + // Children null_mask will be superimposed during structs column construction. + col = cudf::make_structs_column(num_rows, + std::move(contents.children), + null_count, + std::move(*contents.null_mask), + stream, + mr); + } + }); } return std::make_unique
(std::move(result)); } diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 5bc12fb0713..87823d71c6f 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -71,6 +71,7 @@ std::unique_ptr
drop_duplicates( std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal = null_equality::EQUAL, + null_order null_precedence = null_order::BEFORE, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 33c61414a1c..11dbba70c3f 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -134,7 +134,7 @@ cudf::size_type elements_per_thread(Kernel kernel, int num_sms = 0; CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device)); int per_thread = total_size / (max_blocks * num_sms * block_size); - return std::max(1, std::min(per_thread, max_per_thread)); // switch to std::clamp with C++17 + return std::clamp(per_thread, 1, max_per_thread); } /** diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index 246817a5cb5..16b7da0a083 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -42,9 +42,6 @@ namespace cudf { namespace detail { -// TODO: remove this if C++17 is supported. -// `static_assert` requires a string literal at C++14. 
-#define errmsg_cast "`long long int` has different size to `int64_t`" template __forceinline__ __device__ T_output type_reinterpret(T_input value) @@ -142,7 +139,7 @@ struct genericAtomicOperationImpl { __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op) { using T_int = unsigned long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); + static_assert(sizeof(T) == sizeof(T_int)); T old_value = *addr; T assumed{old_value}; @@ -210,7 +207,7 @@ struct genericAtomicOperationImpl { __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op) { using T_int = unsigned long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); + static_assert(sizeof(T) == sizeof(T_int)); T ret = atomicAdd(reinterpret_cast(addr), type_reinterpret(update_value)); return ret; } @@ -240,7 +237,7 @@ struct genericAtomicOperationImpl { __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op) { using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); + static_assert(sizeof(T) == sizeof(T_int)); T ret = atomicMin(reinterpret_cast(addr), type_reinterpret(update_value)); return ret; } @@ -252,7 +249,7 @@ struct genericAtomicOperationImpl { __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op) { using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); + static_assert(sizeof(T) == sizeof(T_int)); T ret = atomicMax(reinterpret_cast(addr), type_reinterpret(update_value)); return ret; } @@ -271,7 +268,7 @@ struct genericAtomicOperationImpl { __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) { using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); + static_assert(sizeof(T) == sizeof(T_int)); T ret = atomicAnd(reinterpret_cast(addr), type_reinterpret(update_value)); return ret; } @@ -290,7 +287,7 @@ struct genericAtomicOperationImpl { 
__forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) { using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); + static_assert(sizeof(T) == sizeof(T_int)); T ret = atomicOr(reinterpret_cast(addr), type_reinterpret(update_value)); return ret; } @@ -309,7 +306,7 @@ struct genericAtomicOperationImpl { __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) { using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); + static_assert(sizeof(T) == sizeof(T_int)); T ret = atomicXor(reinterpret_cast(addr), type_reinterpret(update_value)); return ret; } @@ -400,7 +397,7 @@ struct typesAtomicCASImpl { __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value) { using T_int = unsigned long long int; - static_assert(sizeof(T) == sizeof(T_int), errmsg_cast); + static_assert(sizeof(T) == sizeof(T_int)); T_int ret = atomicCAS(reinterpret_cast(addr), type_reinterpret(compare), @@ -503,6 +500,28 @@ __forceinline__ __device__ T atomicAdd(T* address, T val) return cudf::genericAtomicOperation(address, val, cudf::DeviceSum{}); } +/** + * @brief Overloads for `atomicMul` + * reads the `old` located at the `address` in global or shared memory, + * computes (old * val), and stores the result back to memory at the same + * address. These three operations are performed in one atomic transaction. + * + * The supported cudf types for `atomicMul` are: + * int8_t, int16_t, int32_t, int64_t, float, double, and bool + * + * All types are implemented by `atomicCAS`. 
+ * + * @param[in] address The address of old value in global or shared memory + * @param[in] val The value to be multiplied + * + * @returns The old value at `address` + */ +template +__forceinline__ __device__ T atomicMul(T* address, T val) +{ + return cudf::genericAtomicOperation(address, val, cudf::DeviceProduct{}); +} + /** * @brief Overloads for `atomicMin` * reads the `old` located at the `address` in global or shared memory, diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 7f3c05134e2..6eab13ae9af 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -91,21 +91,21 @@ void CUDA_DEVICE_CALLABLE md5_process(TKey const& key, md5_intermediate_data* ha // 64 bytes for the number of byt es processed in a given step constexpr int md5_chunk_size = 64; if (hash_state->buffer_length + len < md5_chunk_size) { - thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length); + std::memcpy(hash_state->buffer + hash_state->buffer_length, data, len); hash_state->buffer_length += len; } else { uint32_t copylen = md5_chunk_size - hash_state->buffer_length; - thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length); + std::memcpy(hash_state->buffer + hash_state->buffer_length, data, copylen); md5_hash_step(hash_state); while (len > md5_chunk_size + copylen) { - thrust::copy_n(thrust::seq, data + copylen, md5_chunk_size, hash_state->buffer); + std::memcpy(hash_state->buffer, data + copylen, md5_chunk_size); md5_hash_step(hash_state); copylen += md5_chunk_size; } - thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer); + std::memcpy(hash_state->buffer, data + copylen, len - copylen); hash_state->buffer_length = len - copylen; } } @@ -146,7 +146,7 @@ void CUDA_DEVICE_CALLABLE uint32ToLowercaseHexString(uint32_t num, char* destina x |= 
0x3030303030303030; x += offsets; - thrust::copy_n(thrust::seq, reinterpret_cast(&x), 8, destination); + std::memcpy(destination, reinterpret_cast(&x), 8); } struct MD5ListHasher { @@ -211,20 +211,20 @@ MD5ListHasher::operator()(column_device_view data_col, hash_state->message_length += len; if (hash_state->buffer_length + len < 64) { - thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length); + std::memcpy(hash_state->buffer + hash_state->buffer_length, data, len); hash_state->buffer_length += len; } else { uint32_t copylen = 64 - hash_state->buffer_length; - thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length); + std::memcpy(hash_state->buffer + hash_state->buffer_length, data, copylen); md5_hash_step(hash_state); while (len > 64 + copylen) { - thrust::copy_n(thrust::seq, data + copylen, 64, hash_state->buffer); + std::memcpy(hash_state->buffer, data + copylen, 64); md5_hash_step(hash_state); copylen += 64; } - thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer); + std::memcpy(hash_state->buffer, data + copylen, len - copylen); hash_state->buffer_length = len - copylen; } } @@ -262,10 +262,9 @@ struct MD5Hash { thrust::fill_n(thrust::seq, hash_state->buffer, md5_chunk_size - message_length_size, 0x00); } - thrust::copy_n(thrust::seq, - reinterpret_cast(&full_length), - message_length_size, - hash_state->buffer + md5_chunk_size - message_length_size); + std::memcpy(hash_state->buffer + md5_chunk_size - message_length_size, + reinterpret_cast(&full_length), + message_length_size); md5_hash_step(hash_state); #pragma unroll @@ -323,20 +322,20 @@ void CUDA_DEVICE_CALLABLE MD5Hash::operator()(column_device_view co hash_state->message_length += len; if (hash_state->buffer_length + len < 64) { - thrust::copy_n(thrust::seq, data, len, hash_state->buffer + hash_state->buffer_length); + std::memcpy(hash_state->buffer + hash_state->buffer_length, data, len); 
hash_state->buffer_length += len; } else { uint32_t copylen = 64 - hash_state->buffer_length; - thrust::copy_n(thrust::seq, data, copylen, hash_state->buffer + hash_state->buffer_length); + std::memcpy(hash_state->buffer + hash_state->buffer_length, data, copylen); md5_hash_step(hash_state); while (len > 64 + copylen) { - thrust::copy_n(thrust::seq, data + copylen, 64, hash_state->buffer); + std::memcpy(hash_state->buffer, data + copylen, 64); md5_hash_step(hash_state); copylen += 64; } - thrust::copy_n(thrust::seq, data + copylen, len - copylen, hash_state->buffer); + std::memcpy(hash_state->buffer, data + copylen, len - copylen); hash_state->buffer_length = len - copylen; } } @@ -549,6 +548,20 @@ hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(double c return this->compute_floating_point(key); } +template <> +hash_value_type CUDA_DEVICE_CALLABLE +MurmurHash3_32::operator()(numeric::decimal32 const& key) const +{ + return this->compute(key.value()); +} + +template <> +hash_value_type CUDA_DEVICE_CALLABLE +MurmurHash3_32::operator()(numeric::decimal64 const& key) const +{ + return this->compute(key.value()); +} + template <> hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32::operator()(cudf::list_view const& key) const diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 030d2c331c5..1e735719400 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -14,20 +14,70 @@ * limitations under the License. */ +#pragma once + /** * @brief Convenience factories for creating device vectors from host spans * @file vector_factories.hpp */ +#include #include #include #include #include +#include + namespace cudf { namespace detail { +/** + * @brief Asynchronously construct a `device_uvector` and set all elements to zero. + * + * @note This function does not synchronize `stream`. 
+ * + * @tparam T The type of the data to copy + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory and perform the memset + * @param mr The memory resource to use for allocating the returned device_uvector + * @return A device_uvector containing zeros + */ +template +rmm::device_uvector make_zeroed_device_uvector_async( + std::size_t size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + rmm::device_uvector ret(size, stream, mr); + CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value())); + return ret; +} + +/** + * @brief Synchronously construct a `device_uvector` and set all elements to zero. + * + * @note This function synchronizes `stream`. + * + * @tparam T The type of the data to copy + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory and perform the memset + * @param mr The memory resource to use for allocating the returned device_uvector + * @return A device_uvector containing zeros + */ +template +rmm::device_uvector make_zeroed_device_uvector_sync( + std::size_t size, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + rmm::device_uvector ret(size, stream, mr); + CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value())); + stream.synchronize(); + return ret; +} + /** * @brief Asynchronously construct a `device_uvector` containing a deep copy of data from a * `host_span` @@ -43,7 +93,7 @@ namespace detail { template rmm::device_uvector make_device_uvector_async( host_span source_data, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { rmm::device_uvector ret(source_data.size(), stream, mr); @@ -74,7 +124,7 @@ 
template >::value>* = nullptr> rmm::device_uvector make_device_uvector_async( Container const& c, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { return make_device_uvector_async(host_span{c}, stream, mr); @@ -127,7 +177,7 @@ template < nullptr> rmm::device_uvector make_device_uvector_async( Container const& c, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { return make_device_uvector_async( @@ -231,6 +281,181 @@ rmm::device_uvector make_device_uvector_sync( return make_device_uvector_sync(device_span{c}, stream, mr); } +// Utility function template to allow copying to either a thrust::host_vector or std::vector +template +OutContainer make_vector_async(device_span v, rmm::cuda_stream_view stream) +{ + OutContainer result(v.size()); + CUDA_TRY(cudaMemcpyAsync( + result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDeviceToHost, stream.value())); + return result; +} + +/** + * @brief Asynchronously construct a `std::vector` containing a copy of data from a + * `device_span` + * + * @note This function does not synchronize `stream`. + * + * @tparam T The type of the data to copy + * @param source_data The device data to copy + * @param stream The stream on which to perform the copy + * @return The data copied to the host + */ +template +std::vector make_std_vector_async(device_span v, rmm::cuda_stream_view stream) +{ + return make_vector_async>(v, stream); +} + +/** + * @brief Asynchronously construct a `std::vector` containing a copy of data from a device + * container + * + * @note This function synchronizes `stream`. 
+ * + * @tparam Container The type of the container to copy from + * @tparam T The type of the data to copy + * @param c The input device container from which to copy + * @param stream The stream on which to perform the copy + * @return The data copied to the host + */ +template < + typename Container, + std::enable_if_t< + std::is_convertible>::value>* = + nullptr> +std::vector make_std_vector_async(Container const& c, + rmm::cuda_stream_view stream) +{ + return make_std_vector_async(device_span{c}, stream); +} + +/** + * @brief Synchronously construct a `std::vector` containing a copy of data from a + * `device_span` + * + * @note This function does a synchronize on `stream`. + * + * @tparam T The type of the data to copy + * @param source_data The device data to copy + * @param stream The stream on which to perform the copy + * @return The data copied to the host + */ +template +std::vector make_std_vector_sync(device_span v, rmm::cuda_stream_view stream) +{ + auto result = make_std_vector_async(v, stream); + stream.synchronize(); + return result; +} + +/** + * @brief Synchronously construct a `std::vector` containing a copy of data from a device + * container + * + * @note This function synchronizes `stream`. + * + * @tparam Container The type of the container to copy from + * @tparam T The type of the data to copy + * @param c The input device container from which to copy + * @param stream The stream on which to perform the copy + * @return The data copied to the host + */ +template < + typename Container, + std::enable_if_t< + std::is_convertible>::value>* = + nullptr> +std::vector make_std_vector_sync( + Container const& c, rmm::cuda_stream_view stream = rmm::cuda_stream_default) +{ + return make_std_vector_sync(device_span{c}, stream); +} + +/** + * @brief Asynchronously construct a `thrust::host_vector` containing a copy of data from a + * `device_span` + * + * @note This function does not synchronize `stream`. 
+ * + * @tparam T The type of the data to copy + * @param source_data The device data to copy + * @param stream The stream on which to perform the copy + * @return The data copied to the host + */ +template +thrust::host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) +{ + return make_vector_async>(v, stream); +} + +/** + * @brief Asynchronously construct a `thrust::host_vector` containing a copy of data from a device + * container + * + * @note This function does not synchronize `stream`. + * + * @tparam Container The type of the container to copy from + * @tparam T The type of the data to copy + * @param c The input device container from which to copy + * @param stream The stream on which to perform the copy + * @return The data copied to the host + */ +template < + typename Container, + std::enable_if_t< + std::is_convertible>::value>* = + nullptr> +thrust::host_vector make_host_vector_async( + Container const& c, rmm::cuda_stream_view stream) +{ + return make_host_vector_async(device_span{c}, stream); +} + +/** + * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a + * `device_span` + * + * @note This function does a synchronize on `stream`. + * + * @tparam T The type of the data to copy + * @param source_data The device data to copy + * @param stream The stream on which to perform the copy + * @return The data copied to the host + */ +template +thrust::host_vector make_host_vector_sync( + device_span v, rmm::cuda_stream_view stream = rmm::cuda_stream_default) +{ + auto result = make_host_vector_async(v, stream); + stream.synchronize(); + return result; +} + +/** + * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a device + * container + * + * @note This function synchronizes `stream`.
+ * + * @tparam Container The type of the container to copy from + * @tparam T The type of the data to copy + * @param c The input device container from which to copy + * @param stream The stream on which to perform the copy + * @return The data copied to the host + */ +template < + typename Container, + std::enable_if_t< + std::is_convertible>::value>* = + nullptr> +thrust::host_vector make_host_vector_sync( + Container const& c, rmm::cuda_stream_view stream = rmm::cuda_stream_default) +{ + return make_host_vector_sync(device_span{c}, stream); +} + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index c685837ae2b..11ce9199c2d 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -25,7 +25,6 @@ #include #include -#include #include namespace cudf { diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 952075b1703..d7bc9e02eff 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -31,20 +31,8 @@ //! `fixed_point` and supporting types namespace numeric { -/** \cond HIDDEN_SYMBOLS */ -// This is a wrapper struct that enforces "strong typing" -// at the construction site of the type. No implicit -// conversions will be allowed and you will need to use the -// name of the type alias (i.e. scale_type{0}) -template -struct strong_typedef { - T _t; - CUDA_HOST_DEVICE_CALLABLE explicit constexpr strong_typedef(T t) : _t(t) {} - CUDA_HOST_DEVICE_CALLABLE operator T() const { return _t; } -}; -/** \endcond */ -using scale_type = strong_typedef; +enum scale_type : int32_t {}; /** * @brief Scoped enumerator to use when constructing `fixed_point` @@ -76,8 +64,7 @@ namespace detail { * * https://simple.wikipedia.org/wiki/Exponentiation_by_squaring
* Note: this is the iterative equivalent of the recursive definition (faster)
- * Quick-bench: http://quick-bench.com/Wg7o7HYQC9FW5M0CO0wQAjSwP_Y
- * `exponent` comes from `using scale_type = strong_typedef`
+ * Quick-bench: http://quick-bench.com/Wg7o7HYQC9FW5M0CO0wQAjSwP_Y * * @tparam Rep Representation type for return type * @tparam Base The base to be exponentiated @@ -106,14 +93,6 @@ CUDA_HOST_DEVICE_CALLABLE Rep ipow(T exponent) return square * extra; } -/** @brief Helper function to negate strongly typed scale_type - * - * @param scale The scale to be negated - * @return The negated scale - */ -CUDA_HOST_DEVICE_CALLABLE -auto negate(scale_type const& scale) { return scale_type{-scale}; } - /** @brief Function that performs a `right shift` scale "times" on the `val` * * Note: perform this operation when constructing with positive scale @@ -128,7 +107,7 @@ auto negate(scale_type const& scale) { return scale_type{-scale}; } template CUDA_HOST_DEVICE_CALLABLE constexpr T right_shift(T const& val, scale_type const& scale) { - return val / ipow(scale._t); + return val / ipow(static_cast(scale)); } /** @brief Function that performs a `left shift` scale "times" on the `val` @@ -145,7 +124,7 @@ CUDA_HOST_DEVICE_CALLABLE constexpr T right_shift(T const& val, scale_type const template CUDA_HOST_DEVICE_CALLABLE constexpr T left_shift(T const& val, scale_type const& scale) { - return val * ipow(-scale._t); + return val * ipow(static_cast(-scale)); } /** @brief Function that performs a `right` or `left shift` @@ -197,7 +176,7 @@ template ::value>* = nullptr> explicit constexpr operator U() const { - return detail::shift(static_cast(_value), detail::negate(_scale)); + return detail::shift(static_cast(_value), scale_type{-_scale}); } /** @@ -302,7 +281,7 @@ class fixed_point { { // Don't cast to U until converting to Rep because in certain cases casting to U before shifting // will result in integer overflow (i.e. 
if U = int32_t, Rep = int64_t and _value > 2 billion) - return static_cast(detail::shift(_value, detail::negate(_scale))); + return static_cast(detail::shift(_value, scale_type{-_scale})); } CUDA_HOST_DEVICE_CALLABLE operator scaled_integer() const diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 19f87873873..85c469f58f8 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -17,10 +17,13 @@ #pragma once #include +#include +#include #include #include #include +#include #include #include @@ -222,6 +225,62 @@ class groupby { host_span requests, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** + * @brief Performs grouped shifts for specified values. + * + * In `j`th column, for each group, `i`th element is determined by the `i - offsets[j]`th + * element of the group. If `i - offsets[j] < 0 or >= group_size`, the value is determined by + * @p fill_values[j]. + * + * @note The first returned table stores the keys passed to the groupby object. Row `i` of the key + * table corresponds to the group labels of row `i` in the shifted columns. The key order in + * each group matches the input order. The order of each group is arbitrary. The group order + * in successive calls to `groupby::shifts` may be different. 
+ * + * Example: + * @code{.pseudo} + * keys: {1 4 1 3 4 4 1} + * {1 2 1 3 2 2 1} + * values: {3 9 1 4 2 5 7} + * {"a" "c" "bb" "ee" "z" "x" "d"} + * offset: {2, -1} + * fill_value: {@, @} + * result (group order may be different): + * keys: {3 1 1 1 4 4 4} + * {3 1 1 1 2 2 2} + * values: {@ @ @ 3 @ @ 9} + * {@ "bb" "d" @ "z" "x" @} + * + * ------------------------------------------------- + * keys: {1 4 1 3 4 4 1} + * {1 2 1 3 2 2 1} + * values: {3 9 1 4 2 5 7} + * {"a" "c" "bb" "ee" "z" "x" "d"} + * offset: {-2, 1} + * fill_value: {-1, "42"} + * result (group order may be different): + * keys: {3 1 1 1 4 4 4} + * {3 1 1 1 2 2 2} + * values: {-1 7 -1 -1 5 -1 -1} + * {"42" "42" "a" "bb" "42" "c" "z"} + * + * @endcode + * + * @param values Table whose columns are to be shifted + * @param offsets The offsets by which to shift the input + * @param fill_values Fill values for indeterminable outputs + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * @return Pair containing the tables with each group's key and the columns shifted + * + * @throws cudf::logic_error if @p fill_value[i] dtype does not match @p values[i] dtype for + * `i`th column + */ + std::pair, std::unique_ptr
> shift( + table_view const& values, + host_span offsets, + std::vector> const& fill_values, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief The grouped data corresponding to a groupby operation on a set of values. * @@ -251,6 +310,46 @@ class groupby { groups get_groups(cudf::table_view values = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** + * @brief Performs grouped replace nulls on @p value + * + * For each `value[i] == NULL` in group `j`, `value[i]` is replaced with the first non-null value + * in group `j` that precedes or follows `value[i]`. If a non-null value is not found in the + * specified direction, `value[i]` is left NULL. + * + * The returned pair contains a column of the sorted keys and the result column. In result column, + * values of the same group are in contiguous memory. In each group, the order of values maintain + * their original order. The order of groups are not guaranteed. + * + * Example: + * @code{.pseudo} + * + * //Inputs: + * keys: {3 3 1 3 1 3 4} + * {2 2 1 2 1 2 5} + * values: {3 4 7 @ @ @ @} + * {@ @ @ "x" "tt" @ @} + * replace_policies: {FORWARD, BACKWARD} + * + * //Outputs (group orders may be different): + * keys: {3 3 3 3 1 1 4} + * {2 2 2 2 1 1 5} + * result: {3 4 4 4 7 7 @} + * {"x" "x" "x" @ "tt" "tt" @} + * @endcode + * + * @param[in] values A table whose column null values will be replaced. + * @param[in] replace_policies Specify the position of replacement values relative to null values, + * one for each column + * @param[in] mr Device memory resource used to allocate device memory of the returned column. + * + * @return Pair that contains a table with the sorted keys and the result column + */ + std::pair, std::unique_ptr
> replace_nulls( + table_view const& values, + host_span replace_policies, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + private: table_view _keys; ///< Keys that determine grouping null_policy _include_null_keys{null_policy::EXCLUDE}; ///< Include rows in keys diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 0fb5002a953..73bff0b36e5 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include namespace cudf { /** @@ -29,18 +30,18 @@ namespace cudf { * @brief Computes the hash value of each row in the input set of columns. * * @param input The table of columns to hash - * @param initial_hash Optional vector of initial hash values for each column. - * If this vector is empty then each element will be hashed as-is. + * @param initial_hash Optional host_span of initial hash values for each column. + * If this span is empty then each element will be hashed as-is. * @param mr Device memory resource used to allocate the returned column's device memory. 
* * @returns A column where each row is the hash of a column from the input */ std::unique_ptr hash( table_view const& input, - hash_id hash_function = hash_id::HASH_MURMUR3, - std::vector const& initial_hash = {}, - uint32_t seed = DEFAULT_HASH_SEED, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + hash_id hash_function = hash_id::HASH_MURMUR3, + cudf::host_span initial_hash = {}, + uint32_t seed = DEFAULT_HASH_SEED, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 9dbde1432aa..bbe0eb0eaac 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -35,8 +35,8 @@ namespace cudf { /** * @brief Convert a DLPack DLTensor into a cudf table * - * The `device_type` of the DLTensor must be `kDLGPU`, `kDLCPU`, or - * `kDLCPUPinned`, and `device_id` must match the current device. The `ndim` + * The `device_type` of the DLTensor must be `kDLCPU`, `kDLCuda`, or + * `kDLCUDAHost`, and `device_id` must match the current device. The `ndim` * must be set to 1 or 2. The `dtype` must have 1 lane and the bitsize must * match a supported `cudf::data_type`. 
* diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index 40090dbc438..4310d0e7c4b 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -43,22 +43,26 @@ class reader { * * @param filepaths Paths to the files containing the input dataset * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit reader(std::vector const &filepaths, avro_reader_options const &options, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr); /** * @brief Constructor from an array of datasources * * @param sources Input `datasource` objects to read the dataset from * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit reader(std::vector> &&sources, avro_reader_options const &options, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr); /** * @brief Destructor explicitly-declared to avoid inlined in header diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 7790c2ceee1..8ec2818c2ca 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -38,22 +38,26 @@ class reader { * * @param filepaths Paths to the files containing the input dataset * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit reader(std::vector const &filepaths, csv_reader_options 
const &options, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr); /** * @brief Constructor from an array of datasources * * @param sources Input `datasource` objects to read the dataset from * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit reader(std::vector> &&sources, csv_reader_options const &options, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr); /** * @brief Destructor explicitly-declared to avoid inlined in header @@ -83,14 +87,15 @@ class writer { * * @param sinkp The data sink to write the data to * @param options Settings for controlling writing behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ writer(std::unique_ptr sinkp, csv_writer_options const &options, - rmm::mr::device_memory_resource *mr = - rmm::mr::get_current_device_resource()); // cannot provide definition here (because - // _impl is incomplete, hence unique_ptr has - // not enough sizeof() info) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr); // cannot provide definition here (because + // _impl is incomplete hence unique_ptr has + // not enough sizeof() info) /** * @brief Destructor explicitly-declared to avoid inlined in header diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 2176381879a..6ed93dc5c25 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -51,22 +51,26 @@ class reader { * * @param filepaths Paths to the files containing the input dataset * @param options Settings for 
controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit reader(std::vector const &filepaths, json_reader_options const &options, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr); /** * @brief Constructor from an array of datasources * * @param sources Input `datasource` objects to read the dataset from * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit reader(std::vector> &&sources, json_reader_options const &options, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr); /** * @brief Destructor explicitly-declared to avoid inlined in header diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index b8b6bc79159..ab26c01db74 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -52,22 +52,26 @@ class reader { * * @param filepaths Paths to the files containing the input dataset * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit reader(std::vector const& filepaths, orc_reader_options const& options, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Constructor from an array of datasources * * @param sources Input `datasource` objects to read the dataset from * @param options Settings for controlling reading 
behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit reader(std::vector>&& sources, orc_reader_options const& options, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Destructor explicitly declared to avoid inlining in header @@ -101,14 +105,14 @@ class writer { * @param sink The data sink to write the data to * @param options Settings for controlling writing behavior * @param mode Option to write at once or in chunks - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation */ explicit writer(std::unique_ptr sink, orc_writer_options const& options, - SingleWriteMode mode = SingleWriteMode::NO, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + SingleWriteMode mode, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Constructor with chunked writer options. 
@@ -116,14 +120,14 @@ class writer { * @param sink The data sink to write the data to * @param options Settings for controlling writing behavior * @param mode Option to write at once or in chunks - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation */ explicit writer(std::unique_ptr sink, chunked_orc_writer_options const& options, - SingleWriteMode mode = SingleWriteMode::YES, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + SingleWriteMode mode, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Destructor explicitly declared to avoid inlining in header diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 2c946dae748..d95af7a11da 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -54,22 +54,26 @@ class reader { * * @param filepaths Paths to the files containing the input dataset * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit reader(std::vector const& filepaths, parquet_reader_options const& options, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Constructor from an array of datasources * * @param sources Input `datasource` objects to read the dataset from * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit 
reader(std::vector>&& sources, parquet_reader_options const& options, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Destructor explicitly-declared to avoid inlined in header @@ -103,14 +107,14 @@ class writer { * @param sink The data sink to write the data to * @param options Settings for controlling writing behavior * @param mode Option to write at once or in chunks - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation */ explicit writer(std::unique_ptr sink, parquet_writer_options const& options, - SingleWriteMode mode = SingleWriteMode::YES, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + SingleWriteMode mode, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Constructor for writer to handle chunked parquet options. 
@@ -118,16 +122,16 @@ class writer { * @param sink The data sink to write the data to * @param options Settings for controlling writing behavior for chunked writer * @param mode Option to write at once or in chunks - * @param mr Device memory resource to use for device memory allocation * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation * * @return A parquet-compatible blob that contains the data for all rowgroups in the list */ explicit writer(std::unique_ptr sink, chunked_parquet_writer_options const& options, - SingleWriteMode mode = SingleWriteMode::NO, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(), - rmm::cuda_stream_view stream = rmm::cuda_stream_default); + SingleWriteMode mode, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Destructor explicitly-declared to avoid inlined in header diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 6be82b4968c..28e51351730 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -63,12 +63,6 @@ class orc_reader_options { // Cast timestamp columns to a specific type data_type _timestamp_type{type_id::EMPTY}; - // Whether to convert decimals to float64 - bool _decimals_as_float64 = true; - // For decimals as int, optional forced decimal scale; - // -1 is auto (column scale), >=0: number of fractional digits - size_type _forced_decimals_scale = -1; - friend orc_reader_options_builder; /** @@ -134,16 +128,6 @@ class orc_reader_options { */ data_type get_timestamp_type() const { return _timestamp_type; } - /** - * @brief Whether to convert decimals to float64. - */ - bool is_enabled_decimals_as_float64() const { return _decimals_as_float64; } - - /** - * @brief Returns whether decimal scale is inferred or forced to have limited fractional digits. 
- */ - size_type get_forced_decimals_scale() const { return _forced_decimals_scale; } - // Setters /** @@ -207,20 +191,6 @@ class orc_reader_options { * @param type Type of timestamp. */ void set_timestamp_type(data_type type) { _timestamp_type = type; } - - /** - * @brief Enable/Disable conversion of decimals to float64. - * - * @param val Boolean value to enable/disable. - */ - void set_decimals_as_float64(bool val) { _decimals_as_float64 = val; } - - /** - * @brief Sets whether decimal scale is inferred or forced to have limited fractional digits. - * - * @param val Length of fractional digits. - */ - void set_forced_decimals_scale(size_type val) { _forced_decimals_scale = val; } }; class orc_reader_options_builder { @@ -325,30 +295,6 @@ class orc_reader_options_builder { return *this; } - /** - * @brief Enable/Disable conversion of decimals to float64. - * - * @param val Boolean value to enable/disable. - * @return this for chaining. - */ - orc_reader_options_builder& decimals_as_float64(bool val) - { - options._decimals_as_float64 = val; - return *this; - } - - /** - * @brief Sets whether decimal scale is inferred or forced to have limited fractional digits. - * - * @param val Length of fractional digits. - * @return this for chaining. - */ - orc_reader_options_builder& forced_decimals_scale(size_type val) - { - options._forced_decimals_scale = val; - return *this; - } - /** * @brief move orc_reader_options member once it's built. */ diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 906df3f1005..807fab2e85c 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -23,6 +23,8 @@ #include +#include +#include #include namespace cudf { @@ -61,21 +63,9 @@ struct raw_orc_statistics { raw_orc_statistics read_raw_orc_statistics(source_info const& src_info); /** - * @brief Enumerator for types of column statistics that can be included in `column_statistics`. 
- * - * The statistics type depends on the column data type. + * @brief Monostate type alias for the statistics variant. */ -enum class statistics_type { - NONE, - INT, - DOUBLE, - STRING, - BUCKET, - DECIMAL, - DATE, - BINARY, - TIMESTAMP, -}; +using no_statistics = std::monostate; /** * @brief Base class for column statistics that include optional minimum and maximum. @@ -84,13 +74,8 @@ enum class statistics_type { */ template struct minmax_statistics { - std::unique_ptr _minimum; - std::unique_ptr _maximum; - - auto has_minimum() const { return _minimum != nullptr; } - auto has_maximum() const { return _maximum != nullptr; } - auto minimum() const { return _minimum.get(); } - auto maximum() const { return _maximum.get(); } + std::optional minimum; + std::optional maximum; }; /** @@ -100,24 +85,19 @@ struct minmax_statistics { */ template struct sum_statistics { - std::unique_ptr _sum; - - auto has_sum() const { return _sum != nullptr; } - auto sum() const { return _sum.get(); } + std::optional sum; }; /** * @brief Statistics for integral columns. */ struct integer_statistics : minmax_statistics, sum_statistics { - static constexpr statistics_type type = statistics_type::INT; }; /** * @brief Statistics for floating point columns. */ struct double_statistics : minmax_statistics, sum_statistics { - static constexpr statistics_type type = statistics_type::DOUBLE; }; /** @@ -128,7 +108,6 @@ struct double_statistics : minmax_statistics, sum_statistics { * Note: According to ORC specs, the sum should be signed, but pyarrow uses unsigned value */ struct string_statistics : minmax_statistics, sum_statistics { - static constexpr statistics_type type = statistics_type::STRING; }; /** @@ -137,34 +116,26 @@ struct string_statistics : minmax_statistics, sum_statistics _count; - - auto count(size_t index) const { return &_count.at(index); } + std::vector count; }; /** * @brief Statistics for decimal columns. 
*/ struct decimal_statistics : minmax_statistics, sum_statistics { - static constexpr statistics_type type = statistics_type::DECIMAL; }; /** * @brief Statistics for date(time) columns. */ -struct date_statistics : minmax_statistics { - static constexpr statistics_type type = statistics_type::DATE; -}; +using date_statistics = minmax_statistics; /** * @brief Statistics for binary columns. * * The `sum` is the total number of bytes across all elements. */ -struct binary_statistics : sum_statistics { - static constexpr statistics_type type = statistics_type::BINARY; -}; +using binary_statistics = sum_statistics; /** * @brief Statistics for timestamp columns. @@ -173,14 +144,8 @@ struct binary_statistics : sum_statistics { * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC. */ struct timestamp_statistics : minmax_statistics { - static constexpr statistics_type type = statistics_type::TIMESTAMP; - std::unique_ptr _minimum_utc; - std::unique_ptr _maximum_utc; - - auto has_minimum_utc() const { return _minimum_utc != nullptr; } - auto has_maximum_utc() const { return _maximum_utc != nullptr; } - auto minimum_utc() const { return _minimum_utc.get(); } - auto maximum_utc() const { return _maximum_utc.get(); } + std::optional minimum_utc; + std::optional maximum_utc; }; namespace orc { @@ -196,40 +161,20 @@ struct column_statistics; * All columns can have the `number_of_values` statistics. Depending on the data type, a column can * have additional statistics, accessible through `type_specific_stats` accessor. 
*/ -class column_statistics { - private: - std::unique_ptr _number_of_values; - statistics_type _type = statistics_type::NONE; - void* _type_specific_stats = nullptr; - - public: - column_statistics() = default; - column_statistics(cudf::io::orc::column_statistics&& other); - - column_statistics& operator=(column_statistics&&) noexcept; - column_statistics(column_statistics&&) noexcept; - - auto has_number_of_values() const { return _number_of_values != nullptr; } - auto number_of_values() const { return _number_of_values.get(); } - - auto type() const { return _type; } - - /** - * @brief Returns a non-owning pointer to the type-specific statistics of the given type. - * - * Returns null if the requested statistics type does not match the type of the currently held - * type-specific statistics. - * - * @tparam T the statistics type - */ - template - T const* type_specific_stats() const - { - if (T::type != _type) return nullptr; - return static_cast(_type_specific_stats); - } - - ~column_statistics(); +struct column_statistics { + std::optional number_of_values; + std::variant + type_specific_stats; + + column_statistics(cudf::io::orc::column_statistics&& detail_statistics); }; /** diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 7cb3db1eb30..178e46a0c5c 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -392,6 +392,17 @@ class column_in_metadata { std::vector children; public: + /** + * @brief Get the children of this column metadata + * + * @return this for chaining + */ + column_in_metadata& add_child(column_in_metadata const& child) + { + children.push_back(child); + return *this; + } + /** * @brief Set the name of this column * diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 5a2c913d4c3..428a4195bf8 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -424,13 +424,13 @@ std::unique_ptr> left_anti_join( * TableB: {{1, 2, 3}, {1, 2, 5}} * left_on: 
{0} * right_on: {1} - * Result: {{0}, {1}} + * Result: {{0}} * * TableA: {{0, 1, 2}, {1, 2, 5}} * TableB: {{1, 2, 3}} * left_on: {0} * right_on: {0} - * Result: { {0} {1} } + * Result: { {0}, {1} } * @endcode * * @throw cudf::logic_error if number of elements in `left_on` or `right_on` diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp new file mode 100644 index 00000000000..a9407ed57ca --- /dev/null +++ b/cpp/include/cudf/lists/combine.hpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf { +namespace lists { +/** + * @addtogroup lists_combine + * @{ + * @file + */ + +/* + * @brief Flag to specify whether a null list element will be ignored from concatenation, or the + * entire concatenation result involving null list elements will be a null element. + */ +enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; + +/** + * @brief Row-wise concatenating multiple lists columns into a single lists column. + * + * The output column is generated by concatenating the elements within each row of the input + * table. If any row of the input table contains null elements, the concatenation process will + * either ignore those null elements, or will simply set the entire resulting row to be a null + * element. 
+ * + * @code{.pseudo} + * s1 = [{0, 1}, {2, 3, 4}, {5}, {}, {6, 7}] + * s2 = [{8}, {9}, {}, {10, 11, 12}, {13, 14, 15, 16}] + * r = lists::concatenate_rows(s1, s2) + * r is now [{0, 1, 8}, {2, 3, 4, 9}, {5}, {10, 11, 12}, {6, 7, 13, 14, 15, 16}] + * @endcode + * + * @throws cudf::logic_error if any column of the input table is not a lists columns. + * @throws cudf::logic_error if any lists column contains nested typed entry. + * @throws cudf::logic_error if all lists columns do not have the same entry type. + * + * @param input Table of lists to be concatenated. + * @param null_policy The parameter to specify whether a null list element will be ignored from + * concatenation, or any concatenation involving a null element will result in a null list. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return A new column in which each row is a list resulted from concatenating all list elements in + * the corresponding row of the input table. + */ +std::unique_ptr concatenate_rows( + table_view const& input, + concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Concatenating multiple lists on the same row of a lists column into a single list. + * + * Given a lists column where each row in the column is a list of lists of entries, an output lists + * column is generated by concatenating all the list elements at the same row together. If any row + * contains null list elements, the concatenation process will either ignore those null elements, or + * will simply set the entire resulting row to be a null element. 
+ * + * @code{.pseudo} + * l = [ [{1, 2}, {3, 4}, {5}], [{6}, {}, {7, 8, 9}] ] + * r = lists::concatenate_list_elements(l); + * r is [ {1, 2, 3, 4, 5}, {6, 7, 8, 9} ] + * @endcode + * + * @throws cudf::logic_error if the input column is not at least two-level depth lists column (i.e., + * each row must be a list of list). + * @throws cudf::logic_error if the input lists column contains nested typed entries that are not + * lists. + * + * @param input The lists column containing lists of list elements to concatenate. + * @param null_policy The parameter to specify whether a null list element will be ignored from + * concatenation, or any concatenation involving a null element will result in a null list. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return A new column in which each row is a list resulted from concatenating all list elements in + * the corresponding row of the input lists column. + */ +std::unique_ptr concatenate_list_elements( + column_view const& input, + concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +} // namespace lists +} // namespace cudf diff --git a/cpp/include/cudf/lists/detail/combine.hpp b/cpp/include/cudf/lists/detail/combine.hpp new file mode 100644 index 00000000000..9f28074173a --- /dev/null +++ b/cpp/include/cudf/lists/detail/combine.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cudf { +namespace lists { +namespace detail { +/** + * @copydoc cudf::lists::concatenate_rows + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr concatenate_rows( + table_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::lists::concatenate_list_elements + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr concatenate_list_elements( + column_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace lists +} // namespace cudf diff --git a/cpp/include/cudf/lists/detail/copying.hpp b/cpp/include/cudf/lists/detail/copying.hpp index cfa1980e665..3760294f079 100644 --- a/cpp/include/cudf/lists/detail/copying.hpp +++ b/cpp/include/cudf/lists/detail/copying.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,7 +39,7 @@ namespace detail { * @param start Index to first list to select in the column * @param end One past the index to last list to select in the column * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocatet the returned column's device memory. + * @param mr Device memory resource used to allocate the returned column's device memory. 
* @return New lists column of size (end - start) */ std::unique_ptr copy_slice(lists_column_view const& lists, @@ -47,6 +47,7 @@ std::unique_ptr copy_slice(lists_column_view const& lists, size_type end, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/lists/detail/interleave_columns.hpp b/cpp/include/cudf/lists/detail/interleave_columns.hpp new file mode 100644 index 00000000000..7ae90779fdc --- /dev/null +++ b/cpp/include/cudf/lists/detail/interleave_columns.hpp @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cudf { +namespace lists { +namespace detail { + +/** + * @brief Returns a single column by interleaving rows of the given table of list elements. + * + * @code{.pseudo} + * s1 = [{0, 1}, {2, 3, 4}, {5}, {}, {6, 7}] + * s2 = [{8}, {9}, {}, {10, 11, 12}, {13, 14, 15, 16}] + * r = lists::interleave_columns(s1, s2) + * r is now [{0, 1}, {8}, {2, 3, 4}, {9}, {5}, {}, {}, {10, 11, 12}, {6, 7}, {13, 14, 15, 16}] + * @endcode + * + * @throws cudf::logic_error if any column of the input table is not a lists columns. + * @throws cudf::logic_error if any lists column contains nested typed entry. + * @throws cudf::logic_error if all lists columns do not have the same entry type. 
+ * + * @param input Table containing lists columns to interleave. + * @param has_null_mask A boolean flag indicating that the input columns have a null mask. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return The interleaved columns as a single column. + */ +std::unique_ptr interleave_columns( + table_view const& input, + bool has_null_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace lists +} // namespace cudf diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 8e2ecdf49a7..dac67545748 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -20,7 +20,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -32,6 +34,8 @@ #include #include +#include +#include #include @@ -46,8 +50,8 @@ namespace { * also holding a reference to the list column. * * Analogous to the list_view, this class is default constructable, - * and can thus be stored in rmm::device_vector. It is used to represent - * the results of a `scatter()` operation; a device_vector may hold + * and can thus be stored in rmm::device_uvector. It is used to represent + * the results of a `scatter()` operation; a device_uvector may hold * several instances of unbound_list_view, each with a flag indicating * whether it came from the scatter source or target. Each instance * may later be "bound" to the appropriate source/target column, to @@ -131,7 +135,7 @@ struct unbound_list_view { } private: - // Note: Cannot store reference to list column, because of storage in device_vector. + // Note: Cannot store reference to list column, because of storage in device_uvector. 
// Only keep track of whether this list row came from the source or target of scatter. label_type _label{ @@ -140,19 +144,22 @@ struct unbound_list_view { size_type _size{}; // Number of elements in *this* list row. }; +template rmm::device_uvector list_vector_from_column( unbound_list_view::label_type label, cudf::detail::lists_column_device_view const& lists_column, + IndexIterator index_begin, + IndexIterator index_end, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto n_rows = lists_column.size(); + auto n_rows = thrust::distance(index_begin, index_end); auto vector = rmm::device_uvector(n_rows, stream, mr); thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(n_rows), + index_begin, + index_end, vector.begin(), [label, lists_column] __device__(size_type row_index) { return unbound_list_view{label, lists_column, row_index}; @@ -203,43 +210,6 @@ std::pair construct_child_nullmask( mr); } -#ifndef NDEBUG -void print(std::string const& msg, column_view const& col, rmm::cuda_stream_view stream) -{ - if (col.type().id() != type_id::INT32) { - std::cout << "[Cannot print non-INT32 column.]" << std::endl; - return; - } - - std::cout << msg << " = ["; - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col.size(), - [c = col.template data()] __device__(auto const& i) { printf("%d,", c[i]); }); - std::cout << "]" << std::endl; -} - -void print(std::string const& msg, - rmm::device_uvector const& scatter, - rmm::cuda_stream_view stream) -{ - std::cout << msg << " == ["; - - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - scatter.size(), - [s = scatter.begin()] __device__(auto const& i) { - auto si = s[i]; - printf("%s[%d](%d), ", - (si.label() == unbound_list_view::label_type::SOURCE ? 
"S" : "T"), - si.row_index(), - si.size()); - }); - std::cout << "]" << std::endl; -} -#endif // NDEBUG - /** * @brief (type_dispatch endpoint) Functor that constructs the child column result * of `scatter()`ing a list column. @@ -247,7 +217,7 @@ void print(std::string const& msg, * The protocol is as follows: * * Inputs: - * 1. list_vector: A device_vector of unbound_list_view, with each element + * 1. list_vector: A device_uvector of unbound_list_view, with each element * indicating the position, size, and which column the list * row came from. * 2. list_offsets: The offsets column for the (outer) lists column, each offset @@ -336,69 +306,46 @@ struct list_child_constructor { auto const num_child_rows{ cudf::detail::get_value(list_offsets, list_offsets.size() - 1, stream)}; - auto const child_null_mask = + auto child_null_mask = source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() ? construct_child_nullmask( list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr) : std::make_pair(rmm::device_buffer{}, 0); -#ifndef NDEBUG - print("list_offsets ", list_offsets, stream); - print("source_lists.child() ", source_lists_column_view.child(), stream); - print("source_lists.offsets() ", source_lists_column_view.offsets(), stream); - print("target_lists.child() ", target_lists_column_view.child(), stream); - print("target_lists.offsets() ", target_lists_column_view.offsets(), stream); - print("scatter_rows ", list_vector, stream); -#endif // NDEBUG - - auto child_column = cudf::make_fixed_width_column(cudf::data_type{cudf::type_to_id()}, + auto child_column = cudf::make_fixed_width_column(source_lists_column_view.child().type(), num_child_rows, - child_null_mask.first, + std::move(child_null_mask.first), child_null_mask.second, stream, mr); - auto copy_child_values_for_list_index = [d_scattered_lists = - list_vector.begin(), // unbound_list_view* - d_child_column = - child_column->mutable_view().data(), - 
d_offsets = list_offsets.template data(), - source_lists, - target_lists] __device__(auto const& row_index) { - auto const unbound_list_row = d_scattered_lists[row_index]; - auto const actual_list_row = unbound_list_row.bind_to_column(source_lists, target_lists); - auto const& bound_column = - (unbound_list_row.label() == unbound_list_view::label_type::SOURCE ? source_lists - : target_lists); - auto const list_begin_offset = - bound_column.offsets().template element(unbound_list_row.row_index()); - auto const list_end_offset = - bound_column.offsets().template element(unbound_list_row.row_index() + 1); - -#ifndef NDEBUG - printf( - "%d: Unbound == %s[%d](%d), Bound size == %d, calc_begin==%d, calc_end=%d, calc_size=%d\n", - row_index, - (unbound_list_row.label() == unbound_list_view::label_type::SOURCE ? "S" : "T"), - unbound_list_row.row_index(), - unbound_list_row.size(), - actual_list_row.size(), - list_begin_offset, - list_end_offset, - list_end_offset - list_begin_offset); -#endif // NDEBUG - - // Copy all elements in this list row, to "appropriate" offset in child-column. 
- auto const destination_start_offset = d_offsets[row_index]; - thrust::for_each_n(thrust::seq, - thrust::make_counting_iterator(0), - actual_list_row.size(), - [actual_list_row, d_child_column, destination_start_offset] __device__( - auto const& list_element_index) { - d_child_column[destination_start_offset + list_element_index] = - actual_list_row.template element(list_element_index); - }); - }; + auto copy_child_values_for_list_index = + [d_scattered_lists = list_vector.begin(), // unbound_list_view* + d_child_column = child_column->mutable_view().data(), + d_offsets = list_offsets.template data(), + source_lists, + target_lists] __device__(auto const& row_index) { + auto const unbound_list_row = d_scattered_lists[row_index]; + auto const actual_list_row = unbound_list_row.bind_to_column(source_lists, target_lists); + auto const& bound_column = + (unbound_list_row.label() == unbound_list_view::label_type::SOURCE ? source_lists + : target_lists); + auto const list_begin_offset = + bound_column.offsets().template element(unbound_list_row.row_index()); + auto const list_end_offset = + bound_column.offsets().template element(unbound_list_row.row_index() + 1); + + // Copy all elements in this list row, to "appropriate" offset in child-column. + auto const destination_start_offset = d_offsets[row_index]; + thrust::for_each_n(thrust::seq, + thrust::make_counting_iterator(0), + actual_list_row.size(), + [actual_list_row, d_child_column, destination_start_offset] __device__( + auto const& list_element_index) { + d_child_column[destination_start_offset + list_element_index] = + actual_list_row.template element(list_element_index); + }); + }; // For each list-row, copy underlying elements to the child column. 
thrust::for_each_n(rmm::exec_policy(stream), @@ -431,6 +378,8 @@ struct list_child_constructor { auto const num_child_rows{ cudf::detail::get_value(list_offsets, list_offsets.size() - 1, stream)}; + if (num_child_rows == 0) { return make_empty_column(data_type{type_id::STRING}); } + auto string_views = rmm::device_uvector(num_child_rows, stream); auto populate_string_views = [d_scattered_lists = list_vector.begin(), // unbound_list_view* @@ -521,6 +470,11 @@ struct list_child_constructor { auto const num_child_rows{ cudf::detail::get_value(list_offsets, list_offsets.size() - 1, stream)}; + if (num_child_rows == 0) { + // make an empty lists column using the input child type + return empty_like(source_lists_column_view.child()); + } + auto child_list_views = rmm::device_uvector(num_child_rows, stream, mr); // Function to convert from parent list_device_view instances to child list_device_views. @@ -641,7 +595,7 @@ struct list_child_constructor { std::make_unique(structs_list_offsets, stream, mr), std::make_unique(structs_member, stream, mr), structs_list_null_count, - rmm::device_buffer(structs_list_nullmask), + rmm::device_buffer(structs_list_nullmask, stream), stream, mr); }; @@ -704,64 +658,46 @@ struct list_child_constructor { void assert_same_data_type(column_view const& lhs, column_view const& rhs) { CUDF_EXPECTS(lhs.type().id() == rhs.type().id(), "Mismatched Data types."); - CUDF_EXPECTS(lhs.num_children() == rhs.num_children(), "Mismatched number of child columns."); + // Empty string column has no children + CUDF_EXPECTS(lhs.type().id() == type_id::STRING or lhs.num_children() == rhs.num_children(), + "Mismatched number of child columns."); for (int i{0}; i < lhs.num_children(); ++i) { assert_same_data_type(lhs.child(i), rhs.child(i)); } } -} // namespace - /** - * @brief Scatters lists into a copy of the target column - * according to a scatter map. 
+ * @brief General implementation of scattering into list column * - * The scatter is performed according to the scatter iterator such that row - * `scatter_map[i]` of the output column is replaced by the source list-row. - * All other rows of the output column equal corresponding rows of the target table. + * Scattering `source` into `target` according to `scatter_map`. + * The view order of `source` and `target` can be specified by + * `source_vector` and `target_vector` respectively. * - * If the same index appears more than once in the scatter map, the result is - * undefined. - * - * The caller must update the null mask in the output column. - * - * @tparam SourceIterator must produce list_view objects * @tparam MapIterator must produce index values within the target column. * + * @param source_vector A vector of `unbound_list_view` into source column + * @param target_vector A vector of `unbound_list_view` into target column + * @param scatter_map_begin Start iterator of scatter map + * @param scatter_map_end End iterator of scatter map + * @param source Source column view + * @param target Target column view * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return New lists column. 
*/ template -std::unique_ptr scatter( - column_view const& source, +std::unique_ptr scatter_impl( + rmm::device_uvector const& source_vector, + rmm::device_uvector& target_vector, MapIterator scatter_map_begin, MapIterator scatter_map_end, + column_view const& source, column_view const& target, - rmm::cuda_stream_view stream = 0, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto const num_rows = target.size(); - - if (num_rows == 0) { return cudf::empty_like(target); } - - auto const child_column_type = lists_column_view(target).child().type(); - assert_same_data_type(source, target); - using lists_column_device_view = cudf::detail::lists_column_device_view; - using unbound_list_view = cudf::lists::detail::unbound_list_view; - - auto const source_device_view = column_device_view::create(source, stream); - auto const source_vector = list_vector_from_column(unbound_list_view::label_type::SOURCE, - lists_column_device_view(*source_device_view), - stream, - mr); - - auto const target_device_view = column_device_view::create(target, stream); - auto target_vector = list_vector_from_column(unbound_list_view::label_type::TARGET, - lists_column_device_view(*target_device_view), - stream, - mr); + auto const child_column_type = lists_column_view(target).child().type(); // Scatter. thrust::scatter(rmm::exec_policy(stream), @@ -792,7 +728,7 @@ std::unique_ptr scatter( auto null_mask = target.has_nulls() ? copy_bitmask(target, stream, mr) : rmm::device_buffer{0, stream, mr}; - return cudf::make_lists_column(num_rows, + return cudf::make_lists_column(target.size(), std::move(offsets_column), std::move(child_column), cudf::UNKNOWN_NULL_COUNT, @@ -801,6 +737,143 @@ std::unique_ptr scatter( mr); } +} // namespace + +/** + * @brief Scatters lists into a copy of the target column + * according to a scatter map. 
+ * + * The scatter is performed according to the scatter iterator such that row + * `scatter_map[i]` of the output column is replaced by the source list-row. + * All other rows of the output column equal corresponding rows of the target table. + * + * If the same index appears more than once in the scatter map, the result is + * undefined. + * + * The caller must update the null mask in the output column. + * + * @tparam MapIterator must produce index values within the target column. + * + * @param source Source column view + * @param scatter_map_begin Start iterator of scatter map + * @param scatter_map_end End iterator of scatter map + * @param target Target column view + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New lists column. + */ +template +std::unique_ptr scatter( + column_view const& source, + MapIterator scatter_map_begin, + MapIterator scatter_map_end, + column_view const& target, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto const num_rows = target.size(); + if (num_rows == 0) { return cudf::empty_like(target); } + + auto const source_device_view = column_device_view::create(source, stream); + auto const scatter_map_size = thrust::distance(scatter_map_begin, scatter_map_end); + auto const source_vector = + list_vector_from_column(unbound_list_view::label_type::SOURCE, + cudf::detail::lists_column_device_view(*source_device_view), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(scatter_map_size), + stream, + mr); + + auto const target_device_view = column_device_view::create(target, stream); + auto target_vector = + list_vector_from_column(unbound_list_view::label_type::TARGET, + cudf::detail::lists_column_device_view(*target_device_view), + thrust::make_counting_iterator(0), + 
thrust::make_counting_iterator(num_rows), + stream, + mr); + + return scatter_impl( + source_vector, target_vector, scatter_map_begin, scatter_map_end, source, target, stream, mr); +} + +/** + * @brief Scatters list scalar (a single row) into a copy of the target column + * according to a scatter map. + * + * Returns a copy of the target column where every row specified in the `scatter_map` + * is replaced by the row value. + * + * If the same index appears more than once in the scatter map, the result is + * undefined. + * + * The caller must update the null mask in the output column. + * + * @tparam MapIterator must produce index values within the target column. + * + * @param slr Source scalar, specifying row data + * @param scatter_map_begin Start iterator of scatter map + * @param scatter_map_end End iterator of scatter map + * @param target Target column view + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New lists column. + */ +template +std::unique_ptr scatter( + scalar const& slr, + MapIterator scatter_map_begin, + MapIterator scatter_map_end, + column_view const& target, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto const num_rows = target.size(); + if (num_rows == 0) { return cudf::empty_like(target); } + + auto lv = static_cast(&slr); + bool slr_valid = slr.is_valid(stream); + rmm::device_buffer null_mask = + slr_valid ? 
cudf::detail::create_null_mask(1, mask_state::UNALLOCATED, stream, mr) + : cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream, mr); + auto offset_column = make_numeric_column( + data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); + thrust::sequence(rmm::exec_policy(stream), + offset_column->mutable_view().begin(), + offset_column->mutable_view().end(), + 0, + lv->view().size()); + auto wrapped = column_view(data_type{type_id::LIST}, + 1, + nullptr, + static_cast(null_mask.data()), + slr_valid ? 0 : 1, + 0, + {offset_column->view(), lv->view()}); + + auto const source_device_view = column_device_view::create(wrapped, stream); + auto const scatter_map_size = thrust::distance(scatter_map_begin, scatter_map_end); + auto const source_vector = + list_vector_from_column(unbound_list_view::label_type::SOURCE, + cudf::detail::lists_column_device_view(*source_device_view), + thrust::make_constant_iterator(0), + thrust::make_constant_iterator(0) + scatter_map_size, + stream, + mr); + + auto const target_device_view = column_device_view::create(target, stream); + auto target_vector = + list_vector_from_column(unbound_list_view::label_type::TARGET, + cudf::detail::lists_column_device_view(*target_device_view), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + stream, + mr); + + return scatter_impl( + source_vector, target_vector, scatter_map_begin, scatter_map_end, wrapped, target, stream, mr); +} + } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/lists/list_device_view.cuh b/cpp/include/cudf/lists/list_device_view.cuh index 4f207474526..802639f2393 100644 --- a/cpp/include/cudf/lists/list_device_view.cuh +++ b/cpp/include/cudf/lists/list_device_view.cuh @@ -40,10 +40,10 @@ class list_device_view { cudf_assert(row_index >= 0 && row_index < lists_column.size() && row_index < offsets.size() && "row_index out of bounds"); - begin_offset = offsets.element(row_index); + 
begin_offset = offsets.element(row_index + lists_column.offset()); cudf_assert(begin_offset >= 0 && begin_offset <= lists_column.child().size() && "begin_offset out of bounds."); - _size = offsets.element(row_index + 1) - begin_offset; + _size = offsets.element(row_index + 1 + lists_column.offset()) - begin_offset; } ~list_device_view() = default; diff --git a/cpp/include/cudf/lists/lists_column_device_view.cuh b/cpp/include/cudf/lists/lists_column_device_view.cuh index 187b9c2cf6a..d8f082c9a42 100644 --- a/cpp/include/cudf/lists/lists_column_device_view.cuh +++ b/cpp/include/cudf/lists/lists_column_device_view.cuh @@ -75,6 +75,12 @@ class lists_column_device_view { */ CUDA_DEVICE_CALLABLE bool is_null(size_type idx) const { return underlying.is_null(idx); } + /** + * @brief Fetches the offset of the underlying column_device_view, + * in case it is a sliced/offset column. + */ + CUDA_DEVICE_CALLABLE size_type offset() const { return underlying.offset(); } + private: column_device_view underlying; }; diff --git a/cpp/include/cudf/lists/lists_column_factories.hpp b/cpp/include/cudf/lists/lists_column_factories.hpp new file mode 100644 index 00000000000..bdf06cfa9e7 --- /dev/null +++ b/cpp/include/cudf/lists/lists_column_factories.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace cudf { +namespace lists { +namespace detail { + +/** + * @brief Internal API to construct a lists column from a `list_scalar`, for public + * use, use `cudf::make_column_from_scalar`. + * + * @param[in] value The `list_scalar` to construct from + * @param[in] size The number of rows for the output column. + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned column's device memory. + */ +std::unique_ptr make_lists_column_from_scalar( + list_scalar const& value, + size_type size, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace lists +} // namespace cudf diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index 44a64a01c5e..4fb1b4a7319 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -58,7 +59,7 @@ std::unique_ptr rolling_window( size_type preceding_window, size_type following_window, size_type min_periods, - std::unique_ptr const& agg, + rolling_aggregation const& agg, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -67,7 +68,7 @@ std::unique_ptr rolling_window( * size_type preceding_window, * size_type following_window, * size_type min_periods, - * std::unique_ptr const& agg, + * rolling_aggregation const& agg, * rmm::mr::device_memory_resource* mr) * * @param default_outputs A column of per-row default values to be returned instead @@ -80,7 +81,7 @@ std::unique_ptr rolling_window( size_type preceding_window, size_type following_window, size_type min_periods, - std::unique_ptr const& agg, + rolling_aggregation const& agg, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -119,6 +120,7 @@ struct 
window_bounds { { } }; + /** * @brief Applies a grouping-aware, fixed-size rolling window function to the values in a column. * @@ -195,7 +197,7 @@ std::unique_ptr grouped_rolling_window( size_type preceding_window, size_type following_window, size_type min_periods, - std::unique_ptr const& aggr, + rolling_aggregation const& aggr, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -205,7 +207,7 @@ std::unique_ptr grouped_rolling_window( * size_type preceding_window, * size_type following_window, * size_type min_periods, - * std::unique_ptr const& aggr, + * rolling_aggregation const& aggr, * rmm::mr::device_memory_resource* mr) */ std::unique_ptr grouped_rolling_window( @@ -214,7 +216,7 @@ std::unique_ptr grouped_rolling_window( window_bounds preceding_window, window_bounds following_window, size_type min_periods, - std::unique_ptr const& aggr, + rolling_aggregation const& aggr, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -224,7 +226,7 @@ std::unique_ptr grouped_rolling_window( * size_type preceding_window, * size_type following_window, * size_type min_periods, - * std::unique_ptr const& aggr, + * rolling_aggregation const& aggr, * rmm::mr::device_memory_resource* mr) * * @param default_outputs A column of per-row default values to be returned instead @@ -238,7 +240,7 @@ std::unique_ptr grouped_rolling_window( size_type preceding_window, size_type following_window, size_type min_periods, - std::unique_ptr const& aggr, + rolling_aggregation const& aggr, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -249,7 +251,7 @@ std::unique_ptr grouped_rolling_window( * size_type preceding_window, * size_type following_window, * size_type min_periods, - * std::unique_ptr const& aggr, + * rolling_aggregation const& aggr, * rmm::mr::device_memory_resource* mr) */ std::unique_ptr grouped_rolling_window( @@ -259,12 +261,12 @@ std::unique_ptr grouped_rolling_window( 
window_bounds preceding_window, window_bounds following_window, size_type min_periods, - std::unique_ptr const& aggr, + rolling_aggregation const& aggr, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Applies a grouping-aware, timestamp-based rolling window function to the values in a - *column. + * column. * * Like `rolling_window()`, this function aggregates values in a window around each * element of a specified `input` column. It differs from `rolling_window()` in two respects: @@ -353,20 +355,40 @@ std::unique_ptr grouped_time_range_rolling_window( size_type preceding_window_in_days, size_type following_window_in_days, size_type min_periods, - std::unique_ptr const& aggr, + rolling_aggregation const& aggr, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc std::unique_ptr grouped_time_range_rolling_window( - * table_view const& group_keys, - * column_view const& timestamp_column, - * cudf::order const& timestamp_order, - * column_view const& input, - * size_type preceding_window_in_days, - * size_type following_window_in_days, - * size_type min_periods, - * std::unique_ptr const& aggr, - * rmm::mr::device_memory_resource* mr) + * @brief Applies a grouping-aware, timestamp-based rolling window function to the values in a + * column. + * + * @copydetails std::unique_ptr grouped_time_range_rolling_window( + * table_view const& group_keys, + * column_view const& timestamp_column, + * cudf::order const& timestamp_order, + * column_view const& input, + * size_type preceding_window_in_days, + * size_type following_window_in_days, + * size_type min_periods, + * rolling_aggregation const& aggr, + * rmm::mr::device_memory_resource* mr) + * + * The `preceding_window_in_days` and `following_window_in_days` support "unbounded" windows, + * if set to `window_bounds::unbounded()`. 
+ * + * @param[in] group_keys The (pre-sorted) grouping columns + * @param[in] timestamp_column The (pre-sorted) timestamps for each row + * @param[in] timestamp_order The order (ASCENDING/DESCENDING) in which the timestamps are sorted + * @param[in] input The input column (to be aggregated) + * @param[in] preceding_window_in_days Possibly unbounded time-interval in the backward direction, + * specified as a `window_bounds` + * @param[in] following_window_in_days Possibly unbounded time-interval in the forward direction, + * specified as a `window_bounds` + * @param[in] min_periods Minimum number of observations in window required to have a value, + * otherwise element `i` is null. + * @param[in] aggr The rolling window aggregation type (SUM, MAX, MIN, etc.) + * + * @returns A nullable output column containing the rolling window results */ std::unique_ptr grouped_time_range_rolling_window( table_view const& group_keys, @@ -376,7 +398,126 @@ std::unique_ptr grouped_time_range_rolling_window( window_bounds preceding_window_in_days, window_bounds following_window_in_days, size_type min_periods, - std::unique_ptr const& aggr, + rolling_aggregation const& aggr, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Applies a grouping-aware, value range-based rolling window function to the values in a + * column. + * + * This function aggregates rows in a window around each element of a specified `input` column. + * The window is determined based on the values of an ordered `orderby` column, and on the values + * of a `preceding` and `following` scalar representing an inclusive range of orderby column values. + * + * 1. The elements of the `input` column are grouped into distinct groups (e.g. the result of a + * groupby), determined by the corresponding values of the columns under `group_keys`. The + * window-aggregation cannot cross the group boundaries. + * 2. 
Within a group, with all rows sorted by the `orderby` column, the aggregation window + * for a row at index `i` is determined as follows: + * a) If `orderby` is ASCENDING, aggregation window for row `i` includes all `input` rows at + * index `j` such that: + * @code{.pseudo} + * (orderby[i] - preceding) <= orderby[j] <= orderby[i] + following + * @endcode + * b) If `orderby` is DESCENDING, aggregation window for row `i` includes all `input` rows at + * index `j` such that: + * @code{.pseudo} + * (orderby[i] + preceding) >= orderby[j] >= orderby[i] - following + * @endcode + * + * Note: This method requires that the rows are presorted by the group keys and orderby column + * values. + * + * The window intervals are specified as scalar values appropriate for the orderby column. + * Currently, only the following combinations of `orderby` column type and range types + * are supported: + * 1. If `orderby` column is a TIMESTAMP, the `preceding`/`following` windows are specified + * in terms of `DURATION` scalars of the same resolution. + * E.g. For `orderby` column of type `TIMESTAMP_SECONDS`, the intervals may only be + * `DURATION_SECONDS`. Durations of higher resolution (e.g. `DURATION_NANOSECONDS`) + * or lower (e.g. `DURATION_DAYS`) cannot be used. + * 2. If the `orderby` column is an integral type (e.g. `INT32`), the `preceding`/`following` + * should be the exact same type (`INT32`). + * + * @code{.pseudo} + * Example: Consider a motor-racing statistics dataset, containing the following columns: + * 1. driver_name: (STRING) Name of the car driver + * 2. num_overtakes: (INT32) Number of times the driver overtook another car in a lap + * 3. lap_number: (INT32) The number of the lap + * + * The `grouped_range_rolling_window()` function allows one to calculate the total number of overtakes + * each driver made within any 3 lap window of each entry: + * 1. Group/partition the dataset by `driver_name` (This is the group_keys argument.) + * 2. 
Sort each group by the `lap_number` (i.e. This is the orderby_column.) + * 3. Calculate the SUM(num_overtakes) over a window (preceding=1, following=1) + * + * For the following input: + * + * [ // driver_name, num_overtakes, lap_number + * { "bottas", 1, 1 }, + * { "hamilton", 2, 1 }, + * { "bottas", 2, 2 }, + * { "bottas", 1, 3 }, + * { "hamilton", 3, 1 }, + * { "hamilton", 8, 2 }, + * { "bottas", 5, 7 }, + * { "bottas", 6, 8 }, + * { "hamilton", 4, 4 } + * ] + * + * Partitioning (grouping) by `driver_name`, and ordering by `lap_number` yields the following + * `num_overtakes` vector (with 2 groups, one for each distinct `driver_name`): + * + * lap_number: [ 1, 2, 3, 7, 8, 1, 1, 2, 4 ] + * num_overtakes: [ 1, 2, 1, 5, 6, 2, 3, 8, 4 ] + * <-----bottas------>|<----hamilton---> + * + * The SUM aggregation is applied, with 1 preceding, and 1 following, with a minimum of 1 + * period. The aggregation window is thus 3 (laps) wide, yielding the following output column: + * + * Results: [ 3, 4, 3, 11, 11, 13, 13, 13, 4 ] + * + * @endcode + * + * Note: The number of rows participating in each window might vary, based on the index within the + * group, datestamp, and `min_periods`. Apropos: + * 1. results[0] considers 2 values, because it is at the beginning of its group, and has no + * preceding values. + * 2. results[5] considers 3 values, despite being at the beginning of its group. It must include 2 + * following values, based on its orderby_column value. + * + * Each aggregation operation cannot cross group boundaries. + * + * The type of the returned column depends on the input column type `T`, and the aggregation: + * 1. COUNT returns `INT32` columns + * 2. MIN/MAX returns `T` columns + * 3. SUM returns the promoted type for T. Sum on `INT32` yields `INT64`. + * 4. MEAN returns FLOAT64 columns + * 5. COLLECT returns columns of type `LIST`. + * + * LEAD/LAG/ROW_NUMBER are undefined for range queries. 
+ * + * @param[in] group_keys The (pre-sorted) grouping columns + * @param[in] orderby_column The (pre-sorted) order-by column, for range comparisons + * @param[in] order The order (ASCENDING/DESCENDING) in which the order-by column is sorted + * @param[in] input The input column (to be aggregated) + * @param[in] preceding The interval value in the backward direction + * @param[in] following The interval value in the forward direction. + * @param[in] min_periods Minimum number of observations in window required to have a value, + * otherwise element `i` is null. + * @param[in] aggr The rolling window aggregation type (SUM, MAX, MIN, etc.) + * + * @returns A nullable output column containing the rolling window results + */ +std::unique_ptr grouped_range_rolling_window( + table_view const& group_keys, + column_view const& orderby_column, + cudf::order const& order, + column_view const& input, + range_window_bounds const& preceding, + range_window_bounds const& following, + size_type min_periods, + rolling_aggregation const& aggr, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -418,7 +559,7 @@ std::unique_ptr rolling_window( column_view const& preceding_window, column_view const& following_window, size_type min_periods, - std::unique_ptr const& agg, + rolling_aggregation const& agg, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/rolling/range_window_bounds.hpp b/cpp/include/cudf/rolling/range_window_bounds.hpp new file mode 100644 index 00000000000..0c86bd3cf86 --- /dev/null +++ b/cpp/include/cudf/rolling/range_window_bounds.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf { + +/** + * @brief Abstraction for window boundary sizes, to be used with + * `grouped_range_rolling_window()`. + * + * Similar to `window_bounds` in `grouped_rolling_window()`, `range_window_bounds` + * represents window boundaries for use with `grouped_range_rolling_window()`. + * A window may be specified as either of the following: + * 1. A fixed-width numeric scalar value. E.g. + * a) A `DURATION_DAYS` scalar, for use with a `TIMESTAMP_DAYS` orderby column + * b) An `INT32` scalar, for use with an `INT32` orderby column + * 2. "unbounded", indicating that the bounds stretch to the first/last + * row in the group. + */ +struct range_window_bounds { + public: + /** + * @brief Factory method to construct a bounded window boundary. + * + * @param value Finite window boundary + * + */ + static range_window_bounds get(scalar const&); + + /** + * @brief Factory method to construct an unbounded window boundary. + * + * @param type The datatype of the window boundary + */ + static range_window_bounds unbounded(data_type type); + + /** + * @brief Whether or not the window is unbounded + * + * @return true If window is unbounded + * @return false If window is of finite bounds + */ + bool is_unbounded() const { return _is_unbounded; } + + /** + * @brief Returns the underlying scalar value for the bounds + */ + scalar const& range_scalar() const { return *_range_scalar; } + + range_window_bounds(range_window_bounds const&) = + default; // Required to return (by copy) from functions. 
+ range_window_bounds() = default; // Required for use as return types from dispatch functors. + + private: + const bool _is_unbounded{true}; + std::shared_ptr _range_scalar{nullptr}; // To enable copy construction/assignment. + + range_window_bounds(bool is_unbounded_, std::unique_ptr range_scalar_); +}; + +} // namespace cudf diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index ded833f4ca0..6938ad5feaa 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -15,20 +15,15 @@ */ #pragma once +#include +#include #include #include -#include - -#include #include #include #include -#include -#include -#include - /** * @file * @brief Class definitions for cudf::scalar @@ -50,16 +45,27 @@ namespace cudf { */ class scalar { public: - virtual ~scalar() = default; - scalar(scalar&& other) = default; - scalar(scalar const& other) = default; + virtual ~scalar() = default; + scalar(scalar&& other) = default; + scalar& operator=(scalar const& other) = delete; scalar& operator=(scalar&& other) = delete; + /** + * @brief Construct a new scalar object by deep copying another. + * + * @param[in] other The scalar to copy. + * @param[in] stream CUDA stream used for device memory operations. + * @param[in] mr Device memory resource to use for device memory allocation + */ + scalar(scalar const& other, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns the scalar's logical value type */ - data_type type() const noexcept { return _type; } + data_type type() const noexcept; /** * @brief Updates the validity of the value @@ -67,10 +73,7 @@ class scalar { * @param is_valid true: set the value to valid. false: set it to null * @param stream CUDA stream used for device memory operations. 
*/ - void set_valid(bool is_valid, rmm::cuda_stream_view stream = rmm::cuda_stream_default) - { - _is_valid.set_value(is_valid, stream); - } + void set_valid(bool is_valid, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Indicates whether the scalar contains a valid value @@ -81,20 +84,17 @@ class scalar { * @return true Value is valid * @return false Value is invalid/null */ - bool is_valid(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const - { - return _is_valid.value(stream); - } + bool is_valid(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Returns a raw pointer to the validity bool in device memory */ - bool* validity_data() { return _is_valid.data(); } + bool* validity_data(); /** * @brief Returns a const raw pointer to the validity bool in device memory */ - bool const* validity_data() const { return _is_valid.data(); } + bool const* validity_data() const; protected: data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar @@ -116,10 +116,7 @@ class scalar { scalar(data_type type, bool is_valid = false, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : _type(type), _is_valid(is_valid, stream, mr) - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); }; namespace detail { @@ -130,53 +127,57 @@ class fixed_width_scalar : public scalar { public: using value_type = T; - ~fixed_width_scalar() = default; - fixed_width_scalar(fixed_width_scalar&& other) = default; - fixed_width_scalar(fixed_width_scalar const& other) = default; + ~fixed_width_scalar() = default; + fixed_width_scalar(fixed_width_scalar&& other) = default; + fixed_width_scalar& operator=(fixed_width_scalar const& other) = delete; fixed_width_scalar& operator=(fixed_width_scalar&& other) = delete; + /** + * @brief Construct a new fixed-width scalar object by deep copying another. 
+ * + * @param[in] other The scalar to copy. + * @param[in] stream CUDA stream used for device memory operations. + * @param[in] mr Device memory resource to use for device memory allocation + */ + fixed_width_scalar(fixed_width_scalar const& other, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Set the value of the scalar * * @param value New value of scalar * @param stream CUDA stream used for device memory operations. */ - void set_value(T value, rmm::cuda_stream_view stream = rmm::cuda_stream_default) - { - _data.set_value(value, stream); - this->set_valid(true, stream); - } + void set_value(T value, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** * @brief Implicit conversion operator to get the value of the scalar on the host */ - explicit operator value_type() const { return this->value(0); } + explicit operator value_type() const; /** * @brief Get the value of the scalar * * @param stream CUDA stream used for device memory operations. 
*/ - T value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const - { - return _data.value(stream); - } + T value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Returns a raw pointer to the value in device memory */ - T* data() { return _data.data(); } + T* data(); /** * @brief Returns a const raw pointer to the value in device memory */ - T const* data() const { return _data.data(); } + T const* data() const; protected: rmm::device_scalar _data{}; ///< device memory containing the value - fixed_width_scalar() : scalar(data_type(type_to_id())) {} + fixed_width_scalar(); /** * @brief Construct a new fixed width scalar object @@ -189,10 +190,7 @@ class fixed_width_scalar : public scalar { fixed_width_scalar(T value, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : scalar(data_type(type_to_id()), is_valid, stream, mr), _data(value, stream, mr) - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new fixed width scalar object from existing device memory. 
@@ -205,11 +203,7 @@ class fixed_width_scalar : public scalar { fixed_width_scalar(rmm::device_scalar&& data, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : scalar(data_type(type_to_id()), is_valid, stream, mr), - _data{std::forward>(data)} - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); }; } // namespace detail @@ -224,13 +218,24 @@ class numeric_scalar : public detail::fixed_width_scalar { static_assert(is_numeric(), "Unexpected non-numeric type."); public: - numeric_scalar() = default; - ~numeric_scalar() = default; - numeric_scalar(numeric_scalar&& other) = default; - numeric_scalar(numeric_scalar const& other) = default; + numeric_scalar() = default; + ~numeric_scalar() = default; + numeric_scalar(numeric_scalar&& other) = default; + numeric_scalar& operator=(numeric_scalar const& other) = delete; numeric_scalar& operator=(numeric_scalar&& other) = delete; + /** + * @brief Construct a new numeric scalar object by deep copying another. + * + * @param[in] other The scalar to copy. + * @param[in] stream CUDA stream used for device memory operations. 
+ * @param[in] mr Device memory resource to use for device memory allocation + */ + numeric_scalar(numeric_scalar const& other, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Construct a new numeric scalar object * @@ -242,10 +247,7 @@ class numeric_scalar : public detail::fixed_width_scalar { numeric_scalar(T value, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : detail::fixed_width_scalar(value, is_valid, stream, mr) - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new numeric scalar object from existing device memory. @@ -258,10 +260,7 @@ class numeric_scalar : public detail::fixed_width_scalar { numeric_scalar(rmm::device_scalar&& data, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : detail::fixed_width_scalar(std::forward>(data), is_valid, stream, mr) - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); }; /** @@ -274,15 +273,27 @@ class fixed_point_scalar : public scalar { static_assert(is_fixed_point(), "Unexpected non-fixed_point type."); public: - using rep_type = typename T::rep; + using rep_type = typename T::rep; + using value_type = T; + + fixed_point_scalar(); + ~fixed_point_scalar() = default; + fixed_point_scalar(fixed_point_scalar&& other) = default; - fixed_point_scalar() : scalar(data_type(type_to_id())){}; - ~fixed_point_scalar() = default; - fixed_point_scalar(fixed_point_scalar&& other) = default; - fixed_point_scalar(fixed_point_scalar const& other) = default; fixed_point_scalar& operator=(fixed_point_scalar const& other) = delete; fixed_point_scalar& operator=(fixed_point_scalar&& other) = delete; + /** + * 
@brief Construct a new fixed_point scalar object by deep copying another. + * + * @param[in] other The scalar to copy. + * @param[in] stream CUDA stream used for device memory operations. + * @param[in] mr Device memory resource to use for device memory allocation + */ + fixed_point_scalar(fixed_point_scalar const& other, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Construct a new fixed_point scalar object from already shifted value and scale * @@ -296,11 +307,7 @@ class fixed_point_scalar : public scalar { numeric::scale_type scale, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : scalar{data_type{type_to_id(), static_cast(scale)}, is_valid, stream, mr}, - _data{value} - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new fixed_point scalar object from a value and default 0-scale @@ -313,10 +320,7 @@ class fixed_point_scalar : public scalar { fixed_point_scalar(rep_type value, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : scalar{data_type{type_to_id(), 0}, is_valid, stream, mr}, _data{value} - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new fixed_point scalar object from a fixed_point number @@ -329,10 +333,7 @@ class fixed_point_scalar : public scalar { fixed_point_scalar(T value, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : scalar{data_type{type_to_id(), value.scale()}, is_valid, stream, mr}, _data{value.value()} - { - } + rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); /** * @brief Construct a new fixed_point scalar object from existing device memory. @@ -347,42 +348,31 @@ class fixed_point_scalar : public scalar { numeric::scale_type scale, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : scalar{data_type{type_to_id(), scale}, is_valid, stream, mr}, - _data{std::forward>(data)} - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Get the value of the scalar * * @param stream CUDA stream used for device memory operations. */ - rep_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const - { - return _data.value(stream); - } + rep_type value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Get the decimal32 or decimal64 * * @param stream CUDA stream used for device memory operations. */ - T fixed_point_value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const - { - using namespace numeric; - return T{scaled_integer{_data.value(stream), scale_type{type().scale()}}}; - } + T fixed_point_value(rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * @brief Returns a raw pointer to the value in device memory */ - rep_type* data() { return _data.data(); } + rep_type* data(); /** * @brief Returns a const raw pointer to the value in device memory */ - rep_type const* data() const { return _data.data(); } + rep_type const* data() const; protected: rmm::device_scalar _data{}; ///< device memory containing the value @@ -395,13 +385,24 @@ class string_scalar : public scalar { public: using value_type = cudf::string_view; - string_scalar() : scalar(data_type(type_id::STRING)) {} - ~string_scalar() = default; - string_scalar(string_scalar&& other) = default; - string_scalar(string_scalar const& other) = default; + string_scalar(); + ~string_scalar() = default; + 
string_scalar(string_scalar&& other) = default; + string_scalar& operator=(string_scalar const& other) = delete; string_scalar& operator=(string_scalar&& other) = delete; + /** + * @brief Construct a new string scalar object by deep copying another string_scalar. + * + * @param[in] other The other string_scalar to copy + * @param[in] stream CUDA stream used for device memory operations. + * @param[in] mr Device memory resource to use for device memory allocation + */ + string_scalar(string_scalar const& other, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Construct a new string scalar object * @@ -413,10 +414,7 @@ class string_scalar : public scalar { string_scalar(std::string const& string, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : scalar(data_type(type_id::STRING), is_valid), _data(string.data(), string.size(), stream, mr) - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new string scalar object from string_view @@ -449,7 +447,7 @@ class string_scalar : public scalar { /** * @brief Implicit conversion operator to get the value of the scalar in a host std::string */ - explicit operator std::string() const { return this->to_string(0); } + explicit operator std::string() const; /** * @brief Get the value of the scalar in a host std::string @@ -468,12 +466,12 @@ class string_scalar : public scalar { /** * @brief Returns the size of the string in bytes */ - size_type size() const { return _data.size(); } + size_type size() const; /** * @brief Returns a raw pointer to the string in device memory */ - const char* data() const { return static_cast(_data.data()); } + const char* data() const; protected: rmm::device_buffer _data{}; ///< device memory containing the string @@ -490,13 
+488,24 @@ class chrono_scalar : public detail::fixed_width_scalar { static_assert(is_chrono(), "Unexpected non-chrono type"); public: - chrono_scalar() = default; - ~chrono_scalar() = default; - chrono_scalar(chrono_scalar&& other) = default; - chrono_scalar(chrono_scalar const& other) = default; + chrono_scalar() = default; + ~chrono_scalar() = default; + chrono_scalar(chrono_scalar&& other) = default; + chrono_scalar& operator=(chrono_scalar const& other) = delete; chrono_scalar& operator=(chrono_scalar&& other) = delete; + /** + * @brief Construct a new chrono scalar object by deep copying another. + * + * @param[in] other The scalar to copy. + * @param[in] stream CUDA stream used for device memory operations. + * @param[in] mr Device memory resource to use for device memory allocation + */ + chrono_scalar(chrono_scalar const& other, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Construct a new chrono scalar object * @@ -508,10 +517,7 @@ class chrono_scalar : public detail::fixed_width_scalar { chrono_scalar(T value, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : detail::fixed_width_scalar(value, is_valid, stream, mr) - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new chrono scalar object from existing device memory. 
@@ -524,18 +530,29 @@ class chrono_scalar : public detail::fixed_width_scalar { chrono_scalar(rmm::device_scalar&& data, bool is_valid = true, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : detail::fixed_width_scalar(std::forward>(data), is_valid, stream, mr) - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); }; template -struct timestamp_scalar : chrono_scalar { +class timestamp_scalar : public chrono_scalar { + public: static_assert(is_timestamp(), "Unexpected non-timestamp type"); using chrono_scalar::chrono_scalar; + using rep_type = typename T::rep; - timestamp_scalar() = default; + timestamp_scalar() = default; + timestamp_scalar(timestamp_scalar&& other) = default; + + /** + * @brief Construct a new timestamp scalar object by deep copying another. + * + * @param[in] other The scalar to copy. + * @param[in] stream CUDA stream used for device memory operations. + * @param[in] mr Device memory resource to use for device memory allocation + */ + timestamp_scalar(timestamp_scalar const& other, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new timestamp scalar object from a duration that is @@ -551,23 +568,34 @@ struct timestamp_scalar : chrono_scalar { timestamp_scalar(Duration2 const& value, bool is_valid, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : chrono_scalar(T{typename T::duration{value}}, is_valid, stream, mr) - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Return the duration in number of ticks since the UNIX epoch. 
*/ - typename T::rep ticks_since_epoch() { return this->value().time_since_epoch().count(); } + rep_type ticks_since_epoch(); }; template -struct duration_scalar : chrono_scalar { +class duration_scalar : public chrono_scalar { + public: static_assert(is_duration(), "Unexpected non-duration type"); using chrono_scalar::chrono_scalar; + using rep_type = typename T::rep; - duration_scalar() = default; + duration_scalar() = default; + duration_scalar(duration_scalar&& other) = default; + + /** + * @brief Construct a new duration scalar object by deep copying another. + * + * @param[in] other The scalar to copy. + * @param[in] stream CUDA stream used for device memory operations. + * @param[in] mr Device memory resource to use for device memory allocation + */ + duration_scalar(duration_scalar const& other, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a new duration scalar object from tick counts @@ -577,18 +605,130 @@ struct duration_scalar : chrono_scalar { * @param stream CUDA stream used for device memory operations. * @param mr Device memory resource to use for device memory allocation */ - duration_scalar(typename T::rep value, + duration_scalar(rep_type value, bool is_valid, rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : chrono_scalar(T{value}, is_valid, stream, mr) - { - } + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Return the duration in number of ticks. 
*/ - typename T::rep count() { return this->value().count(); } + rep_type count(); }; + +/** + * @brief An owning class to represent a list value in device memory + */ +class list_scalar : public scalar { + public: + list_scalar(); + ~list_scalar() = default; + list_scalar(list_scalar&& other) = default; + + list_scalar& operator=(list_scalar const& other) = delete; + list_scalar& operator=(list_scalar&& other) = delete; + + /** + * @brief Construct a new list scalar object by deep copying another. + * + * @param[in] other The scalar to copy. + * @param[in] stream CUDA stream used for device memory operations. + * @param[in] mr Device memory resource to use for device memory allocation + */ + list_scalar(list_scalar const& other, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Construct a new list scalar object from column_view + * + * The input column_view is copied. + * + * @param data The column data to copy. + * @param is_valid Whether the value held by the scalar is valid + * @param stream CUDA stream used for device memory operations. + * @param mr Device memory resource to use for device memory allocation + */ + list_scalar(cudf::column_view const& data, + bool is_valid = true, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Construct a new list scalar object from existing column. + * + * @param data The column to take ownership of + * @param is_valid Whether the value held by the scalar is valid + * @param stream CUDA stream used for device memory operations. 
+ * @param mr Device memory resource to use for device memory allocation + */ + list_scalar(cudf::column&& data, + bool is_valid = true, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Returns a non-owning, immutable view to underlying device data + */ + column_view view() const; + + private: + cudf::column _data; +}; + +/** + * @brief An owning class to represent a struct value in device memory + */ +class struct_scalar : public scalar { + public: + struct_scalar(); + ~struct_scalar() = default; + struct_scalar(struct_scalar&& other) = default; + struct_scalar(struct_scalar const& other) = default; + struct_scalar& operator=(struct_scalar const& other) = delete; + struct_scalar& operator=(struct_scalar&& other) = delete; + + /** + * @brief Construct a new struct scalar object from table_view + * + * The input table_view is deep-copied. + * + * @param data The table data to copy. + * @param is_valid Whether the value held by the scalar is valid + * @param stream CUDA stream used for device memory operations. + * @param mr Device memory resource to use for device memory allocation + */ + struct_scalar(table_view const& data, + bool is_valid = true, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Construct a new struct scalar object from a host_span of column_views + * + * The input column_views are deep-copied. + * + * @param data The column_views to copy. + * @param is_valid Whether the value held by the scalar is valid + * @param stream CUDA stream used for device memory operations. 
+ * @param mr Device memory resource to use for device memory allocation + */ + struct_scalar(host_span data, + bool is_valid = true, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Returns a non-owning, immutable view to underlying device data + */ + table_view view() const; + + private: + table _data; + + void init(bool is_valid, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + void superimpose_nulls(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +}; + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/scalar/scalar_factories.hpp b/cpp/include/cudf/scalar/scalar_factories.hpp index a0a0a22091e..b96a8c65a04 100644 --- a/cpp/include/cudf/scalar/scalar_factories.hpp +++ b/cpp/include/cudf/scalar/scalar_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -156,5 +156,45 @@ std::unique_ptr make_fixed_point_scalar( return std::make_unique>(value, scale, true, stream, mr); } +/** + * @brief Construct scalar using the given column of elements + * + * @param elements Elements of the list + * @param stream CUDA stream used for device memory operations. + * @param mr Device memory resource used to allocate the scalar's `data` and `is_valid` bool. + */ +std::unique_ptr make_list_scalar( + column_view elements, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Construct a struct scalar using the given table_view. + * + * The columns must have 1 row. + * + * @param data The columnar data to store in the scalar object + * @param stream CUDA stream used for device memory operations. 
+ * @param mr Device memory resource used to allocate the scalar's `data` and `is_valid` bool. + */ +std::unique_ptr make_struct_scalar( + table_view const& data, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Construct a struct scalar using the given span of column views. + * + * The columns must have 1 row. + * + * @param data The columnar data to store in the scalar object + * @param stream CUDA stream used for device memory operations. + * @param mr Device memory resource used to allocate the scalar's `data` and `is_valid` bool. + */ +std::unique_ptr make_struct_scalar( + host_span data, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index f47b4515b3a..192be4fb6a9 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -227,8 +227,9 @@ enum class duplicate_keep_option { * @param[in] input input table_view to copy only unique rows * @param[in] keys vector of indices representing key columns from `input` * @param[in] keep keep first entry, last entry, or no entries if duplicates found - * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, - * nulls are not equal if null_equality::UNEQUAL + * @param[in] nulls_equal flag to denote nulls are equal if null_equality::EQUAL, nulls are not + * equal if null_equality::UNEQUAL + * @param[in] null_precedence flag to denote nulls should appear before or after non-null items * @param[in] mr Device memory resource used to allocate the returned table's device * memory * * @@ -239,6 +240,7 @@ std::unique_ptr
drop_duplicates( std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal = null_equality::EQUAL, + null_order null_precedence = null_order::BEFORE, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 49f824b3805..3e069de2f0f 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -30,52 +31,29 @@ namespace strings { */ /** - * @brief Row-wise concatenates the given list of strings columns and - * returns a single strings column result. - * - * Each new string is created by concatenating the strings from the same - * row delimited by the separator provided. - * - * Any row with a null entry will result in the corresponding output - * row to be null entry unless a narep string is specified to be used - * in its place. - * - * The number of strings in the columns provided must be the same. - * - * @code{.pseudo} - * Example: - * s1 = ['aa', null, '', 'aa'] - * s2 = ['', 'bb', 'bb', null] - * r1 = concatenate([s1,s2]) - * r1 is ['aa', null, 'bb', null] - * r2 = concatenate([s1,s2],':','_') - * r2 is ['aa:', '_:bb', ':bb', 'aa:_'] - * @endcode - * - * @throw cudf::logic_error if input columns are not all strings columns. - * @throw cudf::logic_error if separator is not valid. - * - * @param strings_columns List of string columns to concatenate. - * @param separator String that should inserted between each string from each row. - * Default is an empty string. - * @param narep String that should be used in place of any null strings - * found in any column. 
Default of invalid-scalar means any null entry in any column will - * produces a null result for that row. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with concatenated results. + * @brief Setting for specifying how separators are added with + * null strings elements. */ -std::unique_ptr concatenate( - table_view const& strings_columns, - string_scalar const& separator = string_scalar(""), - string_scalar const& narep = string_scalar("", false), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +enum class separator_on_nulls { + YES, ///< Always add separators between elements + NO ///< Do not add separators if an element is null +}; + +/** + * @brief Setting for specifying what will be output from `join_list_elements` when an input list + * is empty. + */ +enum class output_if_empty_list { + EMPTY_STRING, ///< Empty list will result in empty string + NULL_ELEMENT ///< Empty list will result in a null +}; /** * @brief Concatenates all strings in the column into one new string delimited * by an optional separator string. * * This returns a column with one string. Any null entries are ignored unless - * the narep parameter specifies a replacement string. + * the @p narep parameter specifies a replacement string. 
* * @code{.pseudo} * Example: @@ -110,11 +88,9 @@ std::unique_ptr join_strings( * * - If row separator for a given row is null, output column for that row is null, unless * there is a valid @p separator_narep - * - If all column values for a given row is null, output column for that row is null, unless - * there is a valid @p col_narep - * - null column values for a given row are skipped, if the column replacement isn't valid - * - The separator is only applied between two valid column values - * - If valid @p separator_narep and @p col_narep are provided, the output column is always + * - The separator is applied between two output row values if the @p separate_nulls + * is `YES` or only between valid rows if @p separate_nulls is `NO`. + * - If @p separator_narep and @p col_narep are both valid, the output column is always * non nullable * * @code{.pseudo} @@ -123,16 +99,25 @@ std::unique_ptr join_strings( * c1 = [null, 'cc', 'dd', null, null, 'gg'] * c2 = ['bb', '', null, null, null, 'hh'] * sep = ['::', '%%', '^^', '!', '*', null] - * out0 = concatenate([c0, c1, c2], sep) - * out0 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, null] + * out = concatenate({c0, c1, c2}, sep) + * // all rows have at least one null or sep[i]==null + * out is [null, null, null, null, null, null] * * sep_rep = '+' - * out1 = concatenate([c0, c1, c2], sep, sep_rep) - * out1 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, 'ff+gg+hh'] + * out = concatenate({c0, c1, c2}, sep, sep_rep) + * // all rows with at least one null output as null + * out is [null, null, null, null, null, 'ff+gg+hh'] + * + * col_narep = '-' + * sep_na = non-valid scalar + * out = concatenate({c0, c1, c2}, sep, sep_na, col_narep) + * // only the null entry in the sep column produces a null row + * out is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] * - * col_rep = '-' - * out2 = concatenate([c0, c1, c2], sep, invalid_sep_rep, col_rep) - * out2 is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] + * 
col_narep = '' + * out = concatenate({c0, c1, c2}, sep, sep_rep, col_narep, separator_on_nulls::NO) + * // parameter suppresses separator for null rows + * out is ['aa::bb', 'cc%%', '^^dd', 'ee', '', 'ff+gg+hh'] * @endcode * * @throw cudf::logic_error if no input columns are specified - table view is empty @@ -148,6 +133,8 @@ std::unique_ptr join_strings( * @param col_narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means no null column value replacements. * Default is an invalid string. + * @param separate_nulls If YES, then the separator is included for null rows + * if `col_narep` is valid. * @param mr Resource for allocating device memory. * @return New column with concatenated results. */ @@ -156,8 +143,184 @@ std::unique_ptr concatenate( strings_column_view const& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& col_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Row-wise concatenates the given list of strings columns and + * returns a single strings column result. + * + * Each new string is created by concatenating the strings from the same + * row delimited by the separator provided. + * + * Any row with a null entry will result in the corresponding output + * row to be null entry unless a narep string is specified to be used + * in its place. + * + * If @p separate_nulls is set to `NO` and @p narep is valid then + * separators are not added to the output between null elements. + * Otherwise, separators are always added if @p narep is valid. + * + * More than one column must be specified in the input @p strings_columns + * table.
+ * + * @code{.pseudo} + * Example: + * s1 = ['aa', null, '', 'dd'] + * s2 = ['', 'bb', 'cc', null] + * out = concatenate({s1, s2}) + * out is ['aa', null, 'cc', null] + * + * out = concatenate({s1, s2}, ':', '_') + * out is ['aa:', '_:bb', ':cc', 'dd:_'] + * + * out = concatenate({s1, s2}, ':', '', separator_on_nulls::NO) + * out is ['aa:', 'bb', ':cc', 'dd'] + * @endcode + * + * @throw cudf::logic_error if input columns are not all strings columns. + * @throw cudf::logic_error if separator is not valid. + * @throw cudf::logic_error if only one column is specified + * + * @param strings_columns List of string columns to concatenate. + * @param separator String that should be inserted between each string from each row. + * Default is an empty string. + * @param narep String that should be used in place of any null strings + * found in any column. Default of invalid-scalar means any null entry in any column will + * produce a null result for that row. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column with concatenated results. + */ +std::unique_ptr concatenate( + table_view const& strings_columns, + string_scalar const& separator = string_scalar(""), + string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Given a lists column of strings (each row is a list of strings), concatenates the strings + * within each row and returns a single strings column result. + * + * Each new string is created by concatenating the strings from the same row (same list element) + * delimited by the row separator provided in the @p separators strings column. + * + * A null list row will always result in a null string in the output row.
Any non-null list row + * having a null element will result in the corresponding output row to be null unless a valid + * @p string_narep scalar is provided to be used in its place. Any null row in the @p separators + * column will also result in a null output row unless a valid @p separator_narep scalar is provided + * to be used in place of the null separators. + * + * If @p separate_nulls is set to `NO` and @p string_narep is valid then separators are not added to + * the output between null elements. Otherwise, separators are always added if @p string_narep is + * valid. + * + * If @p empty_list_policy is set to `EMPTY_STRING`, any row that is an empty list will result in + * an empty output string. Otherwise, the output will be a null. + * + * In the special case when the input list row contains all null elements, the output will be the + * same as in case of empty input list regardless of @p string_narep and @p separate_nulls values. + * + * @code{.pseudo} + * Example: + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff', 'gg'] ] + * sep = ['::', '%%', '!', '*', null] + * + * out = join_list_elements(s, sep) + * out is ['aa::bb::cc', null, '!dd', null, null] + * + * out = join_list_elements(s, sep, ':', '_') + * out is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] + * + * out = join_list_elements(s, sep, ':', '', separator_on_nulls::NO) + * out is ['aa::bb::cc', null, '!dd', 'ee', 'ff:gg'] + * @endcode + * + * @throw cudf::logic_error if input column is not lists of strings column. + * @throw cudf::logic_error if the number of rows from `separators` and `lists_strings_column` do + * not match + * + * @param lists_strings_column Column containing lists of strings to concatenate. + * @param separators Strings column that provides separators for concatenation. 
+ * @param separator_narep String that should be used to replace null separator, default is an + * invalid-scalar denoting that rows containing null separator will result in null string in + * the corresponding output rows. + * @param string_narep String that should be used to replace null strings in any non-null list row, + * default is an invalid-scalar denoting that list rows containing null strings will result + * in null string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `string_narep` is valid. + * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will + * result in an empty string. Otherwise, it will result in a null. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New strings column with concatenated results. + */ +std::unique_ptr join_list_elements( + const lists_column_view& lists_strings_column, + const strings_column_view& separators, + string_scalar const& separator_narep = string_scalar("", false), + string_scalar const& string_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, + output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Given a lists column of strings (each row is a list of strings), concatenates the strings + * within each row and returns a single strings column result. + * + * Each new string is created by concatenating the strings from the same row (same list element) + * delimited by the @p separator provided. + * + * A null list row will always result in a null string in the output row. Any non-null list row + * having a null element will result in the corresponding output row to be null unless a + * @p narep string is specified to be used in its place.
+ * + * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the + * output between null elements. Otherwise, separators are always added if @p narep is valid. + * + * If @p empty_list_policy is set to `EMPTY_STRING`, any row that is an empty list will result in + * an empty output string. Otherwise, the output will be a null. + * + * In the special case when the input list row contains all null elements, the output will be the + * same as in case of empty input list regardless of @p narep and @p separate_nulls values. + * + * @code{.pseudo} + * Example: + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff'] ] + * + * out = join_list_elements(s) + * out is ['aabbcc', null, 'dd', null, 'ff'] + * + * out = join_list_elements(s, ':', '_') + * out is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] + * + * out = join_list_elements(s, ':', '', separator_on_nulls::NO) + * out is ['aa:bb:cc', null, ':dd', 'ee', 'ff'] + * @endcode + * + * @throw cudf::logic_error if input column is not lists of strings column. + * @throw cudf::logic_error if separator is not valid. + * + * @param lists_strings_column Column containing lists of strings to concatenate. + * @param separator String that should be inserted between strings of each list row, default is an + * empty string. + * @param narep String that should be used to replace null strings in any non-null list row, default + * is an invalid-scalar denoting that list rows containing null strings will result in null + * string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. + * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will result + * in an empty string. Otherwise, it will result in a null. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New strings column with concatenated results.
+ */ +std::unique_ptr join_list_elements( + const lists_column_view& lists_strings_column, + string_scalar const& separator = string_scalar(""), + string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, + output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index ed783ca996c..d6bdf398886 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -32,11 +33,13 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr concatenate(table_view const& strings_columns, - string_scalar const& separator, - string_scalar const& narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr concatenate( + table_view const& strings_columns, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls = separator_on_nulls::YES, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @copydoc join_strings(table_view const&,string_scalar const&,string_scalar @@ -44,11 +47,12 @@ std::unique_ptr concatenate(table_view const& strings_columns, * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr join_strings(strings_column_view const& strings, - string_scalar const& separator, - string_scalar const& narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr join_strings( + strings_column_view const& strings, + string_scalar const& separator, + string_scalar const& narep, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 176a548da4d..bffcb5c1a31 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -73,8 +74,7 @@ std::unique_ptr copy_if_else( stream, mr); size_type null_count = valid_mask.second; - rmm::device_buffer null_mask{0, stream, mr}; - if (null_count) null_mask = valid_mask.first; + auto null_mask = (null_count > 0) ? 
std::move(valid_mask.first) : rmm::device_buffer{}; // build offsets column auto offsets_transformer = [lhs_begin, rhs_begin, filter_fn] __device__(size_type idx) { @@ -92,8 +92,9 @@ std::unique_ptr copy_if_else( auto d_offsets = offsets_column->view().template data(); // build chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = create_chars_child_column(strings_count, bytes, stream, mr); auto d_chars = chars_column->mutable_view().template data(); // fill in chars thrust::for_each_n( diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index c5d87258b7a..c0fa74c4662 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -180,10 +181,10 @@ std::unique_ptr copy_range( auto p_offsets = thrust::device_pointer_cast(p_offsets_column->view().template data()); - auto chars_bytes = p_offsets[target.size()]; - - auto p_chars_column = strings::detail::create_chars_child_column( - target.size(), null_count, chars_bytes, stream, mr); + auto const chars_bytes = + cudf::detail::get_value(p_offsets_column->view(), target.size(), stream); + auto p_chars_column = + strings::detail::create_chars_child_column(target.size(), chars_bytes, stream, mr); // copy to the chars column diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 988fa552100..86f79881408 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -65,7 +65,7 @@ std::unique_ptr gather_chars(StringIterator strings_begin, auto const output_count = std::distance(map_begin, map_end); if (output_count == 0) return make_empty_column(data_type{type_id::INT8}); - auto chars_column = create_chars_child_column(output_count, 0, chars_bytes, stream, mr); + auto chars_column = create_chars_child_column(output_count, chars_bytes, stream, mr); auto const d_chars = chars_column->mutable_view().template data(); auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char { diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp index e6a0b49f102..85094175572 100644 --- a/cpp/include/cudf/strings/detail/json.hpp +++ b/cpp/include/cudf/strings/detail/json.hpp @@ -32,6 +32,7 @@ namespace detail { std::unique_ptr get_json_object( cudf::strings_column_view const& col, cudf::string_scalar const& json_path, + get_json_object_options options, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git 
a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index caac0579085..8d893a120dc 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,10 +19,10 @@ #include #include #include +#include #include #include #include -#include #include #include @@ -82,9 +82,9 @@ std::unique_ptr merge(strings_column_view const& lhs, auto d_offsets = offsets_column->view().template data(); // create the chars column - size_type bytes = thrust::device_pointer_cast(d_offsets)[strings_count]; - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto const bytes = + cudf::detail::get_value(offsets_column->view(), strings_count, stream); + auto chars_column = strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); // merge the strings auto d_chars = chars_column->mutable_view().template data(); thrust::for_each_n(rmm::exec_policy(stream), diff --git a/cpp/include/cudf/strings/detail/modify_strings.cuh b/cpp/include/cudf/strings/detail/modify_strings.cuh deleted file mode 100644 index 6feaa039bab..00000000000 --- a/cpp/include/cudf/strings/detail/modify_strings.cuh +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -namespace cudf { -namespace strings { -namespace detail { - -/** - * @brief Generic string modification in two passes: 1st pass probes for memory load requirements; - * 2nd pass executes string modification. - * - * @tparam device_probe_functor Functor for probing memory requirements; - * must implement `__device__ int32_t operator()(size_type idx) const` - * @tparam device_execute_functor Functor for executing string modification; must - * implement `__device__ int32_t operator()(size_type idx)` - * @tparam ...Types Types of possible additional arguments to be forwarded - * to the probe / execute functors (pre-condition: must both take the same trailling pack of - * arguments, in addition to their required args) - * - * @param strings Number Column of strings to apply the modifications on; - * it is not modified in place; rather a new column is returned instead - * @param stream CUDA stream used for device memory operations and kernel launches. - * (cannot be a default argument because of the variadic pack); - * @param mr Device memory resource used to allocate the returned column's device memory. 
- * (cannot be a default argument because of the variadic pack); - * @param ...args Additional arguments to be forwarded to - * the probe / execute constructors (can be empty); - * @return modified strings column - */ -template -std::unique_ptr modify_strings(strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr, - Types&&... args) -{ - auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); - - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - size_type null_count = strings.null_count(); - - // copy null mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - // get the lookup tables used for case conversion - - device_probe_functor d_probe_fctr{d_column, std::forward(args)...}; - - // build offsets column -- calculate the size of each output string - auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator(0, d_probe_fctr); - auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto offsets_view = offsets_column->view(); - auto d_new_offsets = - offsets_view.template data(); // not sure why this requires `.template` and the next - // one (`d_chars = ...`) doesn't - - // build the chars column -- convert characters based on case_flag parameter - size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); - - device_execute_functor d_execute_fctr{ - d_column, d_new_offsets, d_chars, std::forward(args)...}; - - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - d_execute_fctr); - - 
return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), - null_count, - std::move(null_mask), - stream, - mr); -} - -} // namespace detail -} // namespace strings -} // namespace cudf diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp index b1c6b9a6f0b..820168ce3de 100644 --- a/cpp/include/cudf/strings/detail/replace.hpp +++ b/cpp/include/cudf/strings/detail/replace.hpp @@ -78,10 +78,22 @@ std::unique_ptr replace( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::strings::replace(strings_column_view const&, string_scalar const&, - * rmm::mr::device_memory_resource*) + * @brief Replaces any null string entries with the given string. * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * This returns a strings column with no null entries. + * + * @code{.pseudo} + * Example: + * s = ["hello", nullptr, "goodbye"] + * r = replace_nulls(s,"**") + * r is now ["hello", "**", "goodbye"] + * @endcode + * + * @param strings Strings column for this operation. + * @param repl Replacement string for null entries. Default is empty string. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New strings column. */ std::unique_ptr replace_nulls( strings_column_view const& strings, diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 92cf537454c..166deb6560d 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -14,17 +14,15 @@ * limitations under the License. 
*/ -#include - #include #include #include #include #include +#include #include #include -#include #include #include @@ -115,7 +113,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, } else { // this approach is 2-3x faster for a large number of smaller string lengths auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); auto d_chars = chars_column->mutable_view().template data(); auto copy_chars = [d_chars] __device__(auto item) { string_index_pair const str = thrust::get<0>(item); @@ -185,9 +183,8 @@ std::unique_ptr make_strings_column(CharIterator chars_begin, [] __device__(auto offset) { return static_cast(offset); }); // build chars column - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); + auto chars_column = strings::detail::create_chars_child_column(strings_count, bytes, stream, mr); + auto chars_view = chars_column->mutable_view(); thrust::copy(rmm::exec_policy(stream), chars_begin, chars_end, chars_view.data()); return make_strings_column(strings_count, diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index ba903c87485..68ebb5dbe19 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -17,11 +17,15 @@ #include #include +#include +#include +#include #include #include #include +#include #include #include @@ -87,6 +91,196 @@ std::unique_ptr child_offsets_from_string_iterator( return make_offsets_child_column(begin, begin + num_strings, stream, mr); } +/** + * @brief Copies input string data into a buffer and increments the pointer by the number of bytes + * copied. + * + * @param buffer Device buffer to copy to. + * @param input Data to copy from. 
+ * @param bytes Number of bytes to copy. + * @return Pointer to the end of the output buffer after the copy. + */ +__device__ inline char* copy_and_increment(char* buffer, const char* input, size_type bytes) +{ + memcpy(buffer, input, bytes); + return buffer + bytes; +} + +/** + * @brief Copies input string data into a buffer and increments the pointer by the number of bytes + * copied. + * + * @param buffer Device buffer to copy to. + * @param d_string String to copy. + * @return Pointer to the end of the output buffer after the copy. + */ +__device__ inline char* copy_string(char* buffer, const string_view& d_string) +{ + return copy_and_increment(buffer, d_string.data(), d_string.size_bytes()); +} + +/** + * @brief Creates child offsets and chars columns by applying the template function that + * can be used for computing the output size of each string as well as create the output. + * + * @tparam SizeAndExecuteFunction Function must accept an index and return a size. + * It must also have members d_offsets and d_chars which are set to + * memory containing the offsets and chars columns during write. + * + * @param size_and_exec_fn This is called twice. Once for the output size of each string. + * After that, the d_offsets and d_chars are set and this is called again to fill in the + * chars memory. + * @param exec_size Number of rows for executing the `size_and_exec_fn` function. + * @param strings_count Number of strings. + * @param mr Device memory resource used to allocate the returned columns' device memory. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @return offsets child column and chars child column for a strings column + */ +template +auto make_strings_children( + SizeAndExecuteFunction size_and_exec_fn, + size_type exec_size, + size_type strings_count, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto offsets_column = make_numeric_column( + data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto offsets_view = offsets_column->mutable_view(); + auto d_offsets = offsets_view.template data(); + size_and_exec_fn.d_offsets = d_offsets; + + // This is called twice -- once for offsets and once for chars. + // Reducing the number of places size_and_exec_fn is inlined speeds up compile time. + auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + exec_size, + size_and_exec_fn); + }; + + // Compute the offsets values + for_each_fn(size_and_exec_fn); + thrust::exclusive_scan( + rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + + // Now build the chars column + auto const bytes = cudf::detail::get_value(offsets_view, strings_count, stream); + std::unique_ptr chars_column = + create_chars_child_column(strings_count, bytes, stream, mr); + + // Execute the function fn again to fill the chars column. + // Note that if the output chars column has zero size, the function fn should not be called to + // avoid accidentally overwriting the offsets. + if (bytes > 0) { + size_and_exec_fn.d_chars = chars_column->mutable_view().template data(); + for_each_fn(size_and_exec_fn); + } + + return std::make_pair(std::move(offsets_column), std::move(chars_column)); +} + +/** + * @brief Creates child offsets and chars columns by applying the template function that + * can be used for computing the output size of each string as well as create the output. 
+ * + * @tparam SizeAndExecuteFunction Function must accept an index and return a size. + * It must also have members d_offsets and d_chars which are set to + * memory containing the offsets and chars columns during write. + * + * @param size_and_exec_fn This is called twice. Once for the output size of each string. + * After that, the d_offsets and d_chars are set and this is called again to fill in the + * chars memory. + * @param strings_count Number of strings. + * @param mr Device memory resource used to allocate the returned columns' device memory. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return offsets child column and chars child column for a strings column + */ +template +auto make_strings_children( + SizeAndExecuteFunction size_and_exec_fn, + size_type strings_count, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); +} + +/** + * @brief Creates child offsets, chars columns and null mask, null count of a strings column by + * applying the template function that can be used for computing the output size of each string as + * well as create the output. + * + * @tparam SizeAndExecuteFunction Function must accept an index and return a size. + * It must have members `d_offsets`, `d_chars`, and `d_validities` which are set to memory + * containing the offsets column, chars column and string validities during write. + * + * @param size_and_exec_fn This is called twice. Once for the output size of each string, which is + * written into the `d_offsets` array. After that, `d_chars` is set and this + * is called again to fill in the chars memory. The `d_validities` array may + * be modified to set the value `0` for the corresponding rows that contain + * null string elements. 
+ * @param exec_size Range for executing the function `size_and_exec_fn`. + * @param strings_count Number of strings. + * @param mr Device memory resource used to allocate the returned columns' device memory. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return offsets child column, chars child column, null_mask, and null_count for a strings column. + */ +template +std::tuple, std::unique_ptr, rmm::device_buffer, size_type> +make_strings_children_with_null_mask( + SizeAndExecuteFunction size_and_exec_fn, + size_type exec_size, + size_type strings_count, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto offsets_column = make_numeric_column( + data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto offsets_view = offsets_column->mutable_view(); + auto d_offsets = offsets_view.template data(); + size_and_exec_fn.d_offsets = d_offsets; + + auto validities = rmm::device_uvector(strings_count, stream); + size_and_exec_fn.d_validities = validities.begin(); + + // This is called twice: once for offsets and validities, and once for chars + auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + exec_size, + size_and_exec_fn); + }; + + // Compute the string sizes (storing in `d_offsets`) and string validities + for_each_fn(size_and_exec_fn); + + // Compute the offsets from string sizes + thrust::exclusive_scan( + rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + + // Now build the chars column + auto const bytes = cudf::detail::get_value(offsets_view, strings_count, stream); + auto chars_column = create_chars_child_column(strings_count, bytes, stream, mr); + + // Execute the function fn again to fill the chars column. 
+ // Note that if the output chars column has zero size, the function fn should not be called to + // avoid accidentally overwriting the offsets. + if (bytes > 0) { + size_and_exec_fn.d_chars = chars_column->mutable_view().template data(); + for_each_fn(size_and_exec_fn); + } + + // Finally compute null mask and null count from the validities array + auto [null_mask, null_count] = cudf::detail::valid_if( + validities.begin(), validities.end(), thrust::identity{}, stream, mr); + + return std::make_tuple(std::move(offsets_column), + std::move(chars_column), + null_count > 0 ? std::move(null_mask) : rmm::device_buffer{}, + null_count); +} + // This template is a thin wrapper around per-context singleton objects. // It maintains a single object for each CUDA context. template diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index a5db4d55001..4eff3f2dafc 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -30,7 +30,6 @@ namespace detail { * This will return the properly sized column to be filled in by the caller. * * @param strings_count Number of strings in the column. - * @param null_count Number of null string entries in the column. * @param bytes Number of bytes for the chars column. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. 
@@ -38,7 +37,6 @@ namespace detail { */ std::unique_ptr create_chars_child_column( size_type strings_count, - size_type null_count, size_type bytes, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp index b39e4a2027c..9081fa23eec 100644 --- a/cpp/include/cudf/strings/json.hpp +++ b/cpp/include/cudf/strings/json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include +#include + namespace cudf { namespace strings { @@ -26,6 +28,76 @@ namespace strings { * @file */ +/** + * @brief Settings for `get_json_object()`. + */ +class get_json_object_options { + // allow single quotes to represent strings in JSON + bool allow_single_quotes = false; + + // individual string values are returned with quotes stripped. + bool strip_quotes_from_single_strings = true; + + public: + /** + * @brief Default constructor. + */ + explicit get_json_object_options() = default; + + /** + * @brief Returns true/false depending on whether single-quotes for representing strings + * are allowed. + */ + CUDA_HOST_DEVICE_CALLABLE bool get_allow_single_quotes() const { return allow_single_quotes; } + + /** + * @brief Returns true/false depending on whether individually returned string values have + * their quotes stripped. + * + * When set to true, if the return value for a given row is an individual string + * (not an object, or an array of strings), strip the quotes from the string and return only the + * contents of the string itself. 
Example: + * + * @code{.pseudo} + * + * With strip_quotes_from_single_strings OFF: + * Input = {"a" : "b"} + * Query = $.a + * Output = "b" + * + * With strip_quotes_from_single_strings ON: + * Input = {"a" : "b"} + * Query = $.a + * Output = b + * + * @endcode + */ + CUDA_HOST_DEVICE_CALLABLE bool get_strip_quotes_from_single_strings() const + { + return strip_quotes_from_single_strings; + } + + /** + * @brief Set whether single-quotes for strings are allowed. + * + * @param _allow_single_quotes bool indicating desired behavior. + */ + void set_allow_single_quotes(bool _allow_single_quotes) + { + allow_single_quotes = _allow_single_quotes; + } + + /** + * @brief Set whether individually returned string values have their quotes stripped. + * + * @param _strip_quotes_from_single_strings bool indicating desired behavior. + */ + void set_strip_quotes_from_single_strings(bool _strip_quotes_from_single_strings) + { + strip_quotes_from_single_strings = _strip_quotes_from_single_strings; + } +}; + /** * @brief Apply a JSONPath string to all rows in an input strings column. * @@ -37,12 +109,14 @@ namespace strings { * * @param col The input strings column. Each row must contain a valid json string * @param json_path The JSONPath string to be applied to each row + * @param options Options for controlling the behavior of the function * @param mr Resource for allocating device memory. 
* @return New strings column containing the retrieved json object strings */ std::unique_ptr get_json_object( cudf::strings_column_view const& col, cudf::string_scalar const& json_path, + get_json_object_options options = get_json_object_options{}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index 8f0957d1020..e9091b88b08 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -151,28 +151,6 @@ std::unique_ptr replace( strings_column_view const& repls, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Replaces any null string entries with the given string. - * - * This returns a strings column with no null entries. - * - * @code{.pseudo} - * Example: - * s = ["hello", nullptr, "goodbye"] - * r = replace_nulls(s,"**") - * r is now ["hello", "**", "goodbye"] - * @endcode - * - * @param strings Strings column for this operation. - * @param repl Replacement string for null entries. Default is empty string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. 
- */ -std::unique_ptr replace_nulls( - strings_column_view const& strings, - string_scalar const& repl = string_scalar(""), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 4bcb46e4655..f5ab2046441 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -92,20 +92,6 @@ __device__ inline size_type string_view::length() const { if (_length == UNKNOWN_STRING_LENGTH) _length = strings::detail::characters_in_string(_data, _bytes); - if (_length && (_char_width == UNKNOWN_CHAR_WIDTH)) { - uint8_t const* ptr = reinterpret_cast(data()); - auto const first = strings::detail::bytes_in_utf8_byte(*ptr); - // see if they are all the same width - _char_width = (thrust::find_if(thrust::seq, - ptr, - ptr + size_bytes(), - [first](auto ch) { - auto width = strings::detail::bytes_in_utf8_byte(ch); - return (width != 0) && (width != first); - })) == (ptr + size_bytes()) - ? 
first - : VARIABLE_CHAR_WIDTH; - } return _length; } @@ -251,7 +237,7 @@ __device__ inline size_type string_view::byte_offset(size_type pos) const size_type offset = 0; const char* sptr = _data; const char* eptr = sptr + _bytes; - if (_char_width > 0) return pos * _char_width; + if (length() == size_bytes()) return pos; while ((pos > 0) && (sptr < eptr)) { size_type charbytes = strings::detail::bytes_in_utf8_byte(static_cast(*sptr++)); if (charbytes) --pos; @@ -408,7 +394,7 @@ __device__ inline string_view string_view::substr(size_type pos, size_type lengt __device__ inline size_type string_view::character_offset(size_type bytepos) const { - if (_char_width > 0) return bytepos / _char_width; + if (length() == size_bytes()) return bytepos; return strings::detail::characters_in_string(data(), bytepos); } diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 667a25c7641..4b1a901d72f 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -36,13 +36,6 @@ using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes */ constexpr cudf::size_type UNKNOWN_STRING_LENGTH{-1}; -/** - * @brief The char width is initialized to this value as a place-holder. - * - * The byte-width of the characters in a string is computed on-demand. - */ -constexpr int8_t UNKNOWN_CHAR_WIDTH{-1}; - /** * @brief This value is assigned to the _char_width member if the string * contains characters of different widths. @@ -314,7 +307,7 @@ class string_view { /** * @brief Default constructor represents an empty string. */ - CUDA_HOST_DEVICE_CALLABLE string_view() : _data(""), _bytes(0), _length(0), _char_width(0) {} + CUDA_HOST_DEVICE_CALLABLE string_view() : _data(""), _bytes(0), _length(0) {} /** * @brief Create instance from existing device char array. @@ -323,7 +316,7 @@ class string_view { * @param bytes Number of bytes in data array. 
*/ CUDA_HOST_DEVICE_CALLABLE string_view(const char* data, size_type bytes) - : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH), _char_width(UNKNOWN_CHAR_WIDTH) + : _data(data), _bytes(bytes), _length(UNKNOWN_STRING_LENGTH) { } @@ -334,10 +327,9 @@ class string_view { string_view& operator=(string_view&&) = default; private: - const char* _data{}; ///< Pointer to device memory contain char array for this string - size_type _bytes{}; ///< Number of bytes in _data for this string - mutable size_type _length{}; ///< Number of characters in this string (computed) - mutable int8_t _char_width{}; ///< Number of bytes per character if uniform width (computed) + const char* _data{}; ///< Pointer to device memory contain char array for this string + size_type _bytes{}; ///< Number of bytes in _data for this string + mutable size_type _length{}; ///< Number of characters in this string (computed) /** * @brief Return the character position of the given byte offset. diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 0c7270b3ba8..4d3c2dcdc56 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -19,7 +19,7 @@ #include #include -#include +#include /** * @file @@ -86,23 +86,6 @@ class strings_column_view : private column_view { //! Strings column APIs. namespace strings { -/** - * @brief Prints the strings to stdout. - * - * @param strings Strings instance for this operation. - * @param start Index of first string to print. - * @param end Index of last string to print. Specify -1 for all strings. - * @param max_width Maximum number of characters to print per string. - * Specify -1 to print all characters. - * @param delimiter The chars to print between each string. - * Default is new-line character. 
- */ -void print(strings_column_view const& strings, - size_type start = 0, - size_type end = -1, - size_type max_width = -1, - const char* delimiter = "\n"); - /** * @brief Create output per Arrow strings format. * @@ -110,10 +93,10 @@ void print(strings_column_view const& strings, * * @param strings Strings instance for this operation. * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned device_vectors. + * @param mr Device memory resource used to allocate the returned device vectors. * @return Pair containing a vector of chars and a vector of offsets. */ -std::pair, rmm::device_vector> create_offsets( +std::pair, rmm::device_uvector> create_offsets( strings_column_view const& strings, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index 61d714c5538..bec5299ab77 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -475,17 +475,19 @@ class row_hasher { // Hash the first column w/ the seed auto const initial_hash = hash_combiner(hash_value_type{0}, - type_dispatcher(_table.column(0).type(), - element_hasher_with_seed{_seed}, - _table.column(0), - row_index)); + type_dispatcher( + _table.column(0).type(), + element_hasher_with_seed{_seed}, + _table.column(0), + row_index)); // Hashes an element in a column auto hasher = [=](size_type column_index) { - return cudf::type_dispatcher(_table.column(column_index).type(), - element_hasher{}, - _table.column(column_index), - row_index); + return cudf::type_dispatcher( + _table.column(column_index).type(), + element_hasher{}, + _table.column(column_index), + row_index); }; // Hash each element and combine all the hash values together @@ -528,10 +530,11 @@ class row_hasher_initial_values { // 
Hashes an element in a column and combines with an initial value auto hasher = [=](size_type column_index) { - auto hash_value = cudf::type_dispatcher(_table.column(column_index).type(), - element_hasher{}, - _table.column(column_index), - row_index); + auto hash_value = + cudf::type_dispatcher(_table.column(column_index).type(), + element_hasher{}, + _table.column(column_index), + row_index); return hash_combiner(_initial_hash[column_index], hash_value); }; diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index a225e590f9a..1ff701c3b01 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -257,9 +257,19 @@ class mutable_table_view : public detail::table_view_base { mutable_table_view(std::vector const& views); }; -inline bool has_nulls(table_view view) +inline bool has_nulls(table_view const& view) { - return std::any_of(view.begin(), view.end(), [](column_view col) { return col.has_nulls(); }); + return std::any_of(view.begin(), view.end(), [](auto const& col) { return col.has_nulls(); }); +} + +inline bool has_nested_nulls(table_view const& input) +{ + return std::any_of(input.begin(), input.end(), [](auto const& col) { + return col.has_nulls() || + std::any_of(col.child_begin(), col.child_end(), [](auto const& child_col) { + return has_nested_nulls(table_view{{child_col}}); + }); + }); } /** diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index d7b6402fe4e..8116097e38e 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -61,6 +61,7 @@ class scalar; // clang-format off class list_scalar; +class struct_scalar; class string_scalar; template class numeric_scalar; template class fixed_point_scalar; @@ -74,8 +75,6 @@ template class timestamp_scalar_device_view; template class duration_scalar_device_view; // clang-format on -class struct_scalar; - class table; class table_view; class mutable_table_view; diff --git 
a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 999306d4ee7..52ad0648e23 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -119,6 +120,11 @@ struct is_host_span_supported_container< // thrust::host_vector> : std::true_type { }; +template +struct is_host_span_supported_container< // + std::basic_string, Alloc>> : std::true_type { +}; + template struct host_span : public cudf::detail::span_base> { using base = cudf::detail::span_base>; @@ -257,6 +263,17 @@ class base_2dspan { return {this->data() + flatten_index(row, 0, this->size()), this->size().second}; } + constexpr base_2dspan subspan(size_t first_row, size_t num_rows) const noexcept + { + return base_2dspan( + _data + flatten_index(first_row, 0, this->size()), num_rows, this->size().second); + } + + constexpr RowType flat_view() + { + return {this->data(), this->size().first * this->size().second}; + } + template typename OtherRowType, diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index aa5f554ad40..e2f5f6db624 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -140,7 +140,7 @@ constexpr inline bool is_numeric() struct is_numeric_impl { template - bool operator()() + constexpr bool operator()() { return is_numeric(); } @@ -181,7 +181,7 @@ constexpr inline bool is_index_type() struct is_index_type_impl { template - bool operator()() + constexpr bool operator()() { return is_index_type(); } @@ -218,7 +218,7 @@ constexpr inline bool is_unsigned() struct is_unsigned_impl { template - bool operator()() + constexpr bool operator()() { return is_unsigned(); } @@ -264,7 +264,7 @@ constexpr inline bool is_floating_point() struct is_floating_point_impl { template - bool operator()() + constexpr bool operator()() { return is_floating_point(); } @@ -332,7 +332,7 @@ constexpr inline bool 
is_timestamp() struct is_timestamp_impl { template - bool operator()() + constexpr bool operator()() { return is_timestamp(); } @@ -367,7 +367,7 @@ constexpr inline bool is_fixed_point() struct is_fixed_point_impl { template - bool operator()() + constexpr bool operator()() { return is_fixed_point(); } @@ -400,7 +400,7 @@ constexpr inline bool is_duration() struct is_duration_impl { template - bool operator()() + constexpr bool operator()() { return is_duration(); } @@ -435,7 +435,7 @@ constexpr inline bool is_chrono() struct is_chrono_impl { template - bool operator()() + constexpr bool operator()() { return is_chrono(); } @@ -488,7 +488,7 @@ constexpr inline bool is_dictionary() struct is_dictionary_impl { template - bool operator()() + constexpr bool operator()() { return is_dictionary(); } @@ -524,7 +524,7 @@ constexpr inline bool is_fixed_width() struct is_fixed_width_impl { template - bool operator()() + constexpr bool operator()() { return is_fixed_width(); } @@ -567,7 +567,7 @@ constexpr inline bool is_compound() struct is_compound_impl { template - bool operator()() + constexpr bool operator()() { return is_compound(); } @@ -609,7 +609,7 @@ constexpr inline bool is_nested() struct is_nested_impl { template - bool operator()() + constexpr bool operator()() { return is_nested(); } diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index 66710960296..b8b63b3be81 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -199,19 +199,24 @@ template <> inline std::pair, std::vector> to_host(column_view c) { auto strings_data = cudf::strings::create_offsets(strings_column_view(c)); - thrust::host_vector h_chars(strings_data.first); - thrust::host_vector h_offsets(strings_data.second); + thrust::host_vector h_chars(strings_data.first.size()); + thrust::host_vector h_offsets(strings_data.second.size()); + CUDA_TRY( + cudaMemcpy(h_chars.data(), 
strings_data.first.data(), h_chars.size(), cudaMemcpyDeviceToHost)); + CUDA_TRY(cudaMemcpy(h_offsets.data(), + strings_data.second.data(), + h_offsets.size() * sizeof(cudf::size_type), + cudaMemcpyDeviceToHost)); // build std::string vector from chars and offsets std::vector host_data; host_data.reserve(c.size()); - - // When C++17, replace this loop with std::adjacent_difference() - for (size_type idx = 0; idx < c.size(); ++idx) { - auto offset = h_offsets[idx]; - auto length = h_offsets[idx + 1] - offset; - host_data.push_back(std::string(h_chars.data() + offset, length)); - } + std::transform( + std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + std::back_inserter(host_data), + [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); return {host_data, bitmask_to_host(c)}; } diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 7667254ffbf..74d22085b26 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -145,7 +145,7 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end) auto transform_begin = thrust::make_transform_iterator(begin, transformer); auto const size = cudf::distance(begin, end); auto const elements = thrust::host_vector(transform_begin, transform_begin + size); - return rmm::device_buffer{elements.data(), size * sizeof(ElementTo)}; + return rmm::device_buffer{elements.data(), size * sizeof(ElementTo), rmm::cuda_stream_default}; } /** @@ -171,7 +171,7 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end) auto transform_begin = thrust::make_transform_iterator(begin, transformer); auto const size = cudf::distance(begin, end); auto const elements = thrust::host_vector(transform_begin, transform_begin + size); - return rmm::device_buffer{elements.data(), size * sizeof(RepType)}; + return rmm::device_buffer{elements.data(), size * sizeof(RepType), 
rmm::cuda_stream_default}; } /** @@ -198,7 +198,7 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end) auto transformer_begin = thrust::make_transform_iterator(begin, to_rep); auto const size = cudf::distance(begin, end); auto const elements = thrust::host_vector(transformer_begin, transformer_begin + size); - return rmm::device_buffer{elements.data(), size * sizeof(RepType)}; + return rmm::device_buffer{elements.data(), size * sizeof(RepType), rmm::cuda_stream_default}; } /** @@ -245,7 +245,8 @@ rmm::device_buffer make_null_mask(ValidityIterator begin, ValidityIterator end) { auto null_mask = make_null_mask_vector(begin, end); return rmm::device_buffer{null_mask.data(), - null_mask.size() * sizeof(decltype(null_mask.front()))}; + null_mask.size() * sizeof(decltype(null_mask.front())), + rmm::cuda_stream_default}; } /** @@ -514,8 +515,10 @@ class fixed_point_column_wrapper : public detail::column_wrapper { auto const id = is_decimal32 ? type_id::DECIMAL32 : type_id::DECIMAL64; auto const data_type = cudf::data_type{id, static_cast(scale)}; - wrapped.reset( - new cudf::column{data_type, size, rmm::device_buffer{elements.data(), size * sizeof(Rep)}}); + wrapped.reset(new cudf::column{ + data_type, + size, + rmm::device_buffer{elements.data(), size * sizeof(Rep), rmm::cuda_stream_default}}); } /** @@ -577,11 +580,12 @@ class fixed_point_column_wrapper : public detail::column_wrapper { auto const id = is_decimal32 ? 
type_id::DECIMAL32 : type_id::DECIMAL64; auto const data_type = cudf::data_type{id, static_cast(scale)}; - wrapped.reset(new cudf::column{data_type, - size, - rmm::device_buffer{elements.data(), size * sizeof(Rep)}, - detail::make_null_mask(v, v + size), - cudf::UNKNOWN_NULL_COUNT}); + wrapped.reset(new cudf::column{ + data_type, + size, + rmm::device_buffer{elements.data(), size * sizeof(Rep), rmm::cuda_stream_default}, + detail::make_null_mask(v, v + size), + cudf::UNKNOWN_NULL_COUNT}); } /** @@ -1514,7 +1518,7 @@ class lists_column_wrapper : public detail::column_wrapper { std::move(offsets), std::move(data), v.size() <= 0 ? 0 : cudf::UNKNOWN_NULL_COUNT, - v.size() <= 0 ? rmm::device_buffer{0} + v.size() <= 0 ? rmm::device_buffer{} : cudf::test::detail::make_null_mask(v.begin(), v.end())); } @@ -1544,7 +1548,7 @@ class lists_column_wrapper : public detail::column_wrapper { size_type num_elements = offsets->size() == 0 ? 0 : offsets->size() - 1; wrapped = - make_lists_column(num_elements, std::move(offsets), std::move(c), 0, rmm::device_buffer{0}); + make_lists_column(num_elements, std::move(offsets), std::move(c), 0, rmm::device_buffer{}); } /** @@ -1776,7 +1780,7 @@ class structs_column_wrapper : public detail::column_wrapper { num_rows, std::move(child_columns), validity.size() <= 0 ? 0 : cudf::UNKNOWN_NULL_COUNT, - validity.size() <= 0 ? rmm::device_buffer{0} + validity.size() <= 0 ? 
rmm::device_buffer{} : detail::make_null_mask(validity.begin(), validity.end())); } diff --git a/cpp/include/cudf_test/iterator_utilities.hpp b/cpp/include/cudf_test/iterator_utilities.hpp index 297bcbf175c..f777ceed675 100644 --- a/cpp/include/cudf_test/iterator_utilities.hpp +++ b/cpp/include/cudf_test/iterator_utilities.hpp @@ -49,7 +49,7 @@ namespace test { * @return auto Validity iterator */ template -static auto iterator_with_null_at(Iter index_start, Iter index_end) +[[maybe_unused]] static auto iterator_with_null_at(Iter index_start, Iter index_end) { using index_type = typename std::iterator_traits::value_type; @@ -77,7 +77,7 @@ static auto iterator_with_null_at(Iter index_start, Iter index_end) * @param indices The indices for which the validity iterator must return `false` (i.e. null) * @return auto Validity iterator */ -static auto iterator_with_null_at(cudf::host_span const& indices) +[[maybe_unused]] static auto iterator_with_null_at(cudf::host_span indices) { return iterator_with_null_at(indices.begin(), indices.end()); } @@ -97,10 +97,24 @@ static auto iterator_with_null_at(cudf::host_span const& * @param index The index for which the validity iterator must return `false` (i.e. 
null) * @return auto Validity iterator */ -static auto iterator_with_null_at(cudf::size_type const& index) +[[maybe_unused]] static auto iterator_with_null_at(cudf::size_type index) { return iterator_with_null_at(std::vector{index}); } +/** + * @brief Bool iterator for marking all elements are null + * + * @return auto Validity iterator which always yields `false` + */ +[[maybe_unused]] static auto iterator_all_nulls() { return thrust::make_constant_iterator(false); } + +/** + * @brief Bool iterator for marking all elements are valid (non-null) + * + * @return auto Validity iterator which always yields `true` + */ +[[maybe_unused]] static auto iterator_no_null() { return thrust::make_constant_iterator(true); } + } // namespace test } // namespace cudf diff --git a/cpp/include/cudf_test/timestamp_utilities.cuh b/cpp/include/cudf_test/timestamp_utilities.cuh index 201b837e936..6cab8b92283 100644 --- a/cpp/include/cudf_test/timestamp_utilities.cuh +++ b/cpp/include/cudf_test/timestamp_utilities.cuh @@ -55,11 +55,10 @@ inline cudf::test::fixed_width_column_wrapper generate_timestamps(in auto lhs = start.time_since_epoch().count(); auto rhs = stop.time_since_epoch().count(); - // When C++17, auto [min, max] = std::minmax(lhs, rhs) - auto min = std::min(lhs, rhs); - auto max = std::max(lhs, rhs); - auto range = max - min; - auto iter = cudf::detail::make_counting_transform_iterator(0, [=](auto i) { + auto const min = std::min(lhs, rhs); + auto const max = std::max(lhs, rhs); + auto const range = max - min; + auto iter = cudf::detail::make_counting_transform_iterator(0, [=](auto i) { return cuda::std::chrono::floor( cuda::std::chrono::milliseconds(min + (range / count) * i)) .count(); diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp index 71c2b74b37b..a344173144d 100644 --- a/cpp/include/cudf_test/type_lists.hpp +++ b/cpp/include/cudf_test/type_lists.hpp @@ -25,6 +25,8 @@ #include #include +#include + #include #include @@ -79,10 
+81,10 @@ constexpr auto types_to_ids() template typename std::enable_if() && !cudf::is_timestamp_t::value, - std::vector>::type + thrust::host_vector>::type make_type_param_vector(std::initializer_list const& init_list) { - std::vector vec(init_list.size()); + thrust::host_vector vec(init_list.size()); std::transform(std::cbegin(init_list), std::cend(init_list), std::begin(vec), [](auto const& e) { if (std::is_unsigned::value) return static_cast(std::abs(e)); @@ -93,10 +95,11 @@ make_type_param_vector(std::initializer_list const& init_list) } template -typename std::enable_if::value, std::vector>::type +typename std::enable_if::value, + thrust::host_vector>::type make_type_param_vector(std::initializer_list const& init_list) { - std::vector vec(init_list.size()); + thrust::host_vector vec(init_list.size()); std::transform(std::cbegin(init_list), std::cend(init_list), std::begin(vec), [](auto const& e) { return TypeParam{typename TypeParam::duration{e}}; }); diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index f78ff98d49d..dda8ce87432 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -143,6 +143,7 @@ * @} * @defgroup lists_apis Lists * @{ + * @defgroup lists_combine Combining * @defgroup lists_extract Extracting * @defgroup lists_contains Searching * @defgroup lists_gather Gathering diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index e178f5a6280..d6b69e0bf73 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -15,7 +15,7 @@ #============================================================================= cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -project(CUDA_KAFKA VERSION 0.19.0 LANGUAGES CXX) +project(CUDA_KAFKA VERSION 21.06.00 LANGUAGES CXX) ################################################################################################### # - Build options diff --git a/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake 
b/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake index 1f7c15d4f75..5b0f31035c3 100644 --- a/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake +++ b/cpp/libcudf_kafka/cmake/thirdparty/CUDF_KAFKA_GetCUDF.cmake @@ -14,22 +14,7 @@ # limitations under the License. #============================================================================= -function(cudfkafka_save_if_enabled var) - if(CUDF_KAFKA_${var}) - unset(${var} PARENT_SCOPE) - unset(${var} CACHE) - endif() -endfunction() - -function(cudfkafka_restore_if_enabled var) - if(CUDF_KAFKA_${var}) - set(${var} ON CACHE INTERNAL "" FORCE) - endif() -endfunction() - function(find_and_configure_cudf VERSION) - cudfkafka_save_if_enabled(BUILD_TESTS) - cudfkafka_save_if_enabled(BUILD_BENCHMARKS) CPMFindPackage(NAME cudf VERSION ${VERSION} GIT_REPOSITORY https://github.com/rapidsai/cudf.git @@ -38,9 +23,16 @@ function(find_and_configure_cudf VERSION) SOURCE_SUBDIR cpp OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF") - cudfkafka_restore_if_enabled(BUILD_TESTS) - cudfkafka_restore_if_enabled(BUILD_BENCHMARKS) + if(cudf_ADDED) + set(cudf_ADDED TRUE PARENT_SCOPE) + endif() endfunction() -set(CUDF_KAFKA_MIN_VERSION_cudf 0.19) -find_and_configure_cudf(${CUDF_KAFKA_MIN_VERSION_cudf}) +set(CUDA_KAFKA_MIN_VERSION_cudf "${CUDA_KAFKA_VERSION_MAJOR}.${CUDA_KAFKA_VERSION_MINOR}.00") +find_and_configure_cudf(${CUDA_KAFKA_MIN_VERSION_cudf}) + +if(cudf_ADDED) + # Since we are building cudf as part of ourselves we need + # to enable the CUDA language in the top-most scope + enable_language(CUDA) +endif() diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 3a044a42101..a878dbe1535 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -22,142 +22,491 @@ namespace cudf { -std::vector aggregation::get_simple_aggregations(data_type col_type) const +namespace detail { + +// simple_aggregations_collector 
---------------------------------------- + +std::vector> simple_aggregations_collector::visit( + data_type col_type, aggregation const& agg) +{ + std::vector> aggs; + aggs.push_back(agg.clone()); + return aggs; +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, sum_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, product_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, min_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, max_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, count_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, any_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, all_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, sum_of_squares_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, mean_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, var_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, std_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type 
col_type, median_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, quantile_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, argmax_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, argmin_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, nunique_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, nth_element_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, row_number_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, collect_list_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, collect_set_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, lead_lag_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, udf_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +// aggregation_finalizer ---------------------------------------- + +void aggregation_finalizer::visit(aggregation const& agg) {} + +void aggregation_finalizer::visit(sum_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(product_aggregation const& agg) +{ + 
visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(min_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(max_aggregation const& agg) { - return {this->kind}; + visit(static_cast(agg)); } -void aggregation::finalize(cudf::detail::aggregation_finalizer& finalizer) + +void aggregation_finalizer::visit(count_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(any_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(all_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(sum_of_squares_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(mean_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(var_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(std_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(median_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(quantile_aggregation const& agg) { - finalizer.visit(*this); + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(argmax_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(argmin_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(nunique_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(nth_element_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(row_number_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(collect_list_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(collect_set_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void 
aggregation_finalizer::visit(lead_lag_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(udf_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +} // namespace detail + +std::vector> aggregation::get_simple_aggregations( + data_type col_type, cudf::detail::simple_aggregations_collector& collector) const +{ + return collector.visit(col_type, *this); } /// Factory to create a SUM aggregation -std::unique_ptr make_sum_aggregation() +template +std::unique_ptr make_sum_aggregation() { - return std::make_unique(aggregation::SUM); + return std::make_unique(); } +template std::unique_ptr make_sum_aggregation(); +template std::unique_ptr make_sum_aggregation(); + /// Factory to create a PRODUCT aggregation -std::unique_ptr make_product_aggregation() +template +std::unique_ptr make_product_aggregation() { - return std::make_unique(aggregation::PRODUCT); + return std::make_unique(); } +template std::unique_ptr make_product_aggregation(); + /// Factory to create a MIN aggregation -std::unique_ptr make_min_aggregation() +template +std::unique_ptr make_min_aggregation() { return std::make_unique(); } +template std::unique_ptr make_min_aggregation(); +template std::unique_ptr make_min_aggregation(); + /// Factory to create a MAX aggregation -std::unique_ptr make_max_aggregation() +template +std::unique_ptr make_max_aggregation() { return std::make_unique(); } +template std::unique_ptr make_max_aggregation(); +template std::unique_ptr make_max_aggregation(); + /// Factory to create a COUNT aggregation -std::unique_ptr make_count_aggregation(null_policy null_handling) +template +std::unique_ptr make_count_aggregation(null_policy null_handling) { auto kind = (null_handling == null_policy::INCLUDE) ? 
aggregation::COUNT_ALL : aggregation::COUNT_VALID; - return std::make_unique(kind); + return std::make_unique(kind); } +template std::unique_ptr make_count_aggregation( + null_policy null_handling); +template std::unique_ptr make_count_aggregation( + null_policy null_handling); + /// Factory to create a ANY aggregation -std::unique_ptr make_any_aggregation() +template +std::unique_ptr make_any_aggregation() { - return std::make_unique(aggregation::ANY); + return std::make_unique(); } +template std::unique_ptr make_any_aggregation(); + /// Factory to create a ALL aggregation -std::unique_ptr make_all_aggregation() +template +std::unique_ptr make_all_aggregation() { - return std::make_unique(aggregation::ALL); + return std::make_unique(); } +template std::unique_ptr make_all_aggregation(); + /// Factory to create a SUM_OF_SQUARES aggregation -std::unique_ptr make_sum_of_squares_aggregation() +template +std::unique_ptr make_sum_of_squares_aggregation() { - return std::make_unique(aggregation::SUM_OF_SQUARES); + return std::make_unique(); } +template std::unique_ptr make_sum_of_squares_aggregation(); + /// Factory to create a MEAN aggregation -std::unique_ptr make_mean_aggregation() +template +std::unique_ptr make_mean_aggregation() { return std::make_unique(); } +template std::unique_ptr make_mean_aggregation(); +template std::unique_ptr make_mean_aggregation(); + /// Factory to create a VARIANCE aggregation -std::unique_ptr make_variance_aggregation(size_type ddof) +template +std::unique_ptr make_variance_aggregation(size_type ddof) { return std::make_unique(ddof); -}; +} +template std::unique_ptr make_variance_aggregation(size_type ddof); + /// Factory to create a STD aggregation -std::unique_ptr make_std_aggregation(size_type ddof) +template +std::unique_ptr make_std_aggregation(size_type ddof) { return std::make_unique(ddof); -}; +} +template std::unique_ptr make_std_aggregation(size_type ddof); + /// Factory to create a MEDIAN aggregation -std::unique_ptr 
make_median_aggregation() +template +std::unique_ptr make_median_aggregation() { - // TODO I think this should just return a quantile_aggregation? - return std::make_unique(aggregation::MEDIAN); + return std::make_unique(); } +template std::unique_ptr make_median_aggregation(); + /// Factory to create a QUANTILE aggregation -std::unique_ptr make_quantile_aggregation(std::vector const& q, - interpolation i) +template +std::unique_ptr make_quantile_aggregation(std::vector const& q, interpolation i) { return std::make_unique(q, i); } -/// Factory to create a ARGMAX aggregation -std::unique_ptr make_argmax_aggregation() +template std::unique_ptr make_quantile_aggregation( + std::vector const& q, interpolation i); + +/// Factory to create an ARGMAX aggregation +template +std::unique_ptr make_argmax_aggregation() { - return std::make_unique(aggregation::ARGMAX); + return std::make_unique(); } -/// Factory to create a ARGMIN aggregation -std::unique_ptr make_argmin_aggregation() +template std::unique_ptr make_argmax_aggregation(); +template std::unique_ptr make_argmax_aggregation(); + +/// Factory to create an ARGMIN aggregation +template +std::unique_ptr make_argmin_aggregation() { - return std::make_unique(aggregation::ARGMIN); + return std::make_unique(); } -/// Factory to create a NUNIQUE aggregation -std::unique_ptr make_nunique_aggregation(null_policy null_handling) +template std::unique_ptr make_argmin_aggregation(); +template std::unique_ptr make_argmin_aggregation(); + +/// Factory to create an NUNIQUE aggregation +template +std::unique_ptr make_nunique_aggregation(null_policy null_handling) { return std::make_unique(null_handling); } -/// Factory to create a NTH_ELEMENT aggregation -std::unique_ptr make_nth_element_aggregation(size_type n, null_policy null_handling) +template std::unique_ptr make_nunique_aggregation( + null_policy null_handling); + +/// Factory to create an NTH_ELEMENT aggregation +template +std::unique_ptr make_nth_element_aggregation(size_type 
n, null_policy null_handling) { return std::make_unique(n, null_handling); } +template std::unique_ptr make_nth_element_aggregation( + size_type n, null_policy null_handling); + /// Factory to create a ROW_NUMBER aggregation -std::unique_ptr make_row_number_aggregation() +template +std::unique_ptr make_row_number_aggregation() { - return std::make_unique(aggregation::ROW_NUMBER); + return std::make_unique(); } +template std::unique_ptr make_row_number_aggregation(); +template std::unique_ptr make_row_number_aggregation(); + /// Factory to create a COLLECT_LIST aggregation -std::unique_ptr make_collect_list_aggregation(null_policy null_handling) +template +std::unique_ptr make_collect_list_aggregation(null_policy null_handling) { return std::make_unique(null_handling); } +template std::unique_ptr make_collect_list_aggregation( + null_policy null_handling); +template std::unique_ptr make_collect_list_aggregation( + null_policy null_handling); + /// Factory to create a COLLECT_SET aggregation -std::unique_ptr make_collect_set_aggregation(null_policy null_handling, - null_equality nulls_equal, - nan_equality nans_equal) +template +std::unique_ptr make_collect_set_aggregation(null_policy null_handling, + null_equality nulls_equal, + nan_equality nans_equal) { return std::make_unique(null_handling, nulls_equal, nans_equal); } +template std::unique_ptr make_collect_set_aggregation( + null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); +template std::unique_ptr make_collect_set_aggregation( + null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); + /// Factory to create a LAG aggregation -std::unique_ptr make_lag_aggregation(size_type offset) +template +std::unique_ptr make_lag_aggregation(size_type offset) { - return std::make_unique(aggregation::LAG, offset); + return std::make_unique(aggregation::LAG, offset); } +template std::unique_ptr make_lag_aggregation(size_type offset); +template std::unique_ptr 
make_lag_aggregation( + size_type offset); + /// Factory to create a LEAD aggregation -std::unique_ptr make_lead_aggregation(size_type offset) +template +std::unique_ptr make_lead_aggregation(size_type offset) { - return std::make_unique(aggregation::LEAD, offset); + return std::make_unique(aggregation::LEAD, offset); } +template std::unique_ptr make_lead_aggregation(size_type offset); +template std::unique_ptr make_lead_aggregation( + size_type offset); + /// Factory to create a UDF aggregation -std::unique_ptr make_udf_aggregation(udf_type type, - std::string const& user_defined_aggregator, - data_type output_type) +template +std::unique_ptr make_udf_aggregation(udf_type type, + std::string const& user_defined_aggregator, + data_type output_type) { - aggregation* a = + auto* a = new detail::udf_aggregation{type == udf_type::PTX ? aggregation::PTX : aggregation::CUDA, user_defined_aggregator, output_type}; - return std::unique_ptr(a); + return std::unique_ptr(a); } +template std::unique_ptr make_udf_aggregation( + udf_type type, std::string const& user_defined_aggregator, data_type output_type); +template std::unique_ptr make_udf_aggregation( + udf_type type, std::string const& user_defined_aggregator, data_type output_type); namespace detail { namespace { diff --git a/cpp/src/aggregation/result_cache.cpp b/cpp/src/aggregation/result_cache.cpp index f35c05349b0..b259e5965ef 100644 --- a/cpp/src/aggregation/result_cache.cpp +++ b/cpp/src/aggregation/result_cache.cpp @@ -50,11 +50,8 @@ std::unique_ptr result_cache::release_result(size_t col_idx, aggregation { CUDF_EXPECTS(has_result(col_idx, agg), "Result does not exist in cache"); - // unordered_map.extract() is a c++17 feature so we do this: - auto result_it = _cache[col_idx].find(agg); - std::unique_ptr result = std::move(result_it->second.second); - _cache[col_idx].erase(result_it); - return result; + auto result_it = _cache[col_idx].extract(agg); + return std::move(result_it.mapped().second); } } // namespace 
detail diff --git a/cpp/src/ast/linearizer.cpp b/cpp/src/ast/linearizer.cpp index cc70845e1ff..66a32ead35e 100644 --- a/cpp/src/ast/linearizer.cpp +++ b/cpp/src/ast/linearizer.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ #include -#include +#include #include #include #include diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index bc055d46869..43d3bde97c2 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include @@ -61,27 +61,25 @@ namespace detail { * each thread. */ template -__launch_bounds__(max_block_size) __global__ - void compute_column_kernel(table_device_view const table, - const cudf::detail::fixed_width_scalar_device_view_base* literals, - mutable_column_device_view output_column, - const detail::device_data_reference* data_references, - const ast_operator* operators, - const cudf::size_type* operator_source_indices, - cudf::size_type num_operators, - cudf::size_type num_intermediates) +__launch_bounds__(max_block_size) __global__ void compute_column_kernel( + table_device_view const table, + device_span literals, + mutable_column_device_view output_column, + device_span data_references, + device_span operators, + device_span operator_source_indices, + cudf::size_type num_intermediates) { extern __shared__ std::int64_t intermediate_storage[]; auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * num_intermediates]; - auto const start_idx = cudf::size_type(threadIdx.x + blockIdx.x * blockDim.x); - auto const stride = cudf::size_type(blockDim.x * gridDim.x); - auto const num_rows = table.num_rows(); + auto const start_idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + auto const stride = static_cast(blockDim.x * gridDim.x); auto const evaluator = cudf::ast::detail::row_evaluator(table, literals, thread_intermediate_storage, &output_column); - for (cudf::size_type row_index = start_idx; row_index < num_rows; row_index 
+= stride) { + for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) { evaluate_row_expression( - evaluator, data_references, operators, operator_source_indices, num_operators, row_index); + evaluator, data_references, operators, operator_source_indices, row_index); } } @@ -90,40 +88,8 @@ std::unique_ptr compute_column(table_view const table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // Linearize the AST - auto const expr_linearizer = linearizer(expr, table); - auto const data_references = expr_linearizer.data_references(); - auto const literals = expr_linearizer.literals(); - auto const operators = expr_linearizer.operators(); - auto const num_operators = cudf::size_type(operators.size()); - auto const operator_source_indices = expr_linearizer.operator_source_indices(); - auto const expr_data_type = expr_linearizer.root_data_type(); - - // Create ast_plan and device buffer - auto plan = ast_plan(); - plan.add_to_plan(data_references); - plan.add_to_plan(literals); - plan.add_to_plan(operators); - plan.add_to_plan(operator_source_indices); - auto const host_data_buffer = plan.get_host_data_buffer(); - auto const buffer_offsets = plan.get_offsets(); - auto const buffer_size = host_data_buffer.second; - auto device_data_buffer = - rmm::device_buffer(host_data_buffer.first.get(), buffer_size, stream, mr); - // To reduce overhead, we don't call a stream sync here. - // The stream is synced later when the table_device_view is created. 
- - // Create device pointers to components of plan - auto const device_data_buffer_ptr = static_cast(device_data_buffer.data()); - auto const device_data_references = reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[0]); - auto const device_literals = - reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[1]); - auto const device_operators = - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]); - auto const device_operator_source_indices = - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]); + auto const expr_linearizer = linearizer(expr, table); // Linearize the AST + auto const plan = ast_plan{expr_linearizer, stream, mr}; // Create ast_plan // Create table device view auto table_device = table_device_view::create(table, stream); @@ -131,7 +97,7 @@ std::unique_ptr compute_column(table_view const table, // Prepare output column auto output_column = cudf::make_fixed_width_column( - expr_data_type, table_num_rows, mask_state::UNALLOCATED, stream, mr); + expr_linearizer.root_data_type(), table_num_rows, mask_state::UNALLOCATED, stream, mr); auto mutable_output_device = cudf::mutable_column_device_view::create(output_column->mutable_view(), stream); @@ -155,12 +121,11 @@ std::unique_ptr compute_column(table_view const table, cudf::ast::detail::compute_column_kernel <<>>( *table_device, - device_literals, + plan._device_literals, *mutable_output_device, - device_data_references, - device_operators, - device_operator_source_indices, - num_operators, + plan._device_data_references, + plan._device_operators, + plan._device_operator_source_indices, num_intermediates); CHECK_CUDA(stream.value()); return output_column; diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 7d43524f608..2b24e0cfa3d 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -123,7 +123,8 @@ struct binary_op { auto out_view = out->mutable_view(); auto out_itr = 
out_view.begin(); auto lhs_device_view = column_device_view::create(lhs, stream); - auto rhs_scalar = static_cast const&>(rhs); + using rhs_type = cudf::scalar_type_t; + auto rhs_scalar = rhs_type(static_cast(rhs), stream); auto rhs_scalar_view = get_scalar_device_view(rhs_scalar); if (lhs.has_nulls()) { auto lhs_itr = cudf::detail::make_null_replacement_iterator(*lhs_device_view, Lhs{}); diff --git a/cpp/src/bitmask/is_element_valid.cpp b/cpp/src/bitmask/is_element_valid.cpp new file mode 100644 index 00000000000..47870e01567 --- /dev/null +++ b/cpp/src/bitmask/is_element_valid.cpp @@ -0,0 +1,47 @@ + +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include + +namespace cudf { +namespace detail { + +bool is_element_valid_sync(column_view const& col_view, + size_type element_index, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(element_index >= 0 and element_index < col_view.size(), "invalid index."); + if (!col_view.nullable()) { return true; } + + bitmask_type word; + // null_mask() returns device ptr to bitmask without offset + size_type index = element_index + col_view.offset(); + CUDA_TRY(cudaMemcpyAsync(&word, + col_view.null_mask() + word_index(index), + sizeof(bitmask_type), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); + return static_cast(word & (bitmask_type{1} << intra_word_index(index))); +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu index d30e5fc746a..3ee8e0a33a9 100644 --- a/cpp/src/column/column.cu +++ b/cpp/src/column/column.cu @@ -43,19 +43,8 @@ #include namespace cudf { -// Copy constructor -column::column(column const &other) - : _type{other._type}, - _size{other._size}, - _data{other._data}, - _null_mask{other._null_mask}, - _null_count{other._null_count} -{ - _children.reserve(other.num_children()); - for (auto const &c : other._children) { _children.emplace_back(std::make_unique(*c)); } -} -// Copy ctor w/ explicit stream/mr +// Copy ctor w/ optional stream/mr column::column(column const &other, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) @@ -165,14 +154,16 @@ void column::set_null_mask(rmm::device_buffer &&new_null_mask, size_type new_nul _null_count = new_null_count; } -void column::set_null_mask(rmm::device_buffer const &new_null_mask, size_type new_null_count) +void column::set_null_mask(rmm::device_buffer const &new_null_mask, + size_type new_null_count, + rmm::cuda_stream_view stream) { if (new_null_count > 0) { CUDF_EXPECTS(new_null_mask.size() >= cudf::bitmask_allocation_size_bytes(this->size()), "Column with null values 
must be nullable and the null mask \ buffer size should match the size of the column."); } - _null_mask = new_null_mask; // copy + _null_mask = rmm::device_buffer{new_null_mask, stream}; // copy _null_count = new_null_count; } diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 03339c2e0a8..86059a72e8f 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -159,90 +158,6 @@ std::unique_ptr make_fixed_width_column(data_type type, /// clang-format on } -struct column_from_scalar_dispatch { - template - std::unique_ptr operator()(scalar const& value, - size_type size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - if (!value.is_valid()) - return make_fixed_width_column(value.type(), size, mask_state::ALL_NULL, stream, mr); - auto output_column = - make_fixed_width_column(value.type(), size, mask_state::UNALLOCATED, stream, mr); - auto view = output_column->mutable_view(); - detail::fill_in_place(view, 0, size, value, stream); - return output_column; - } -}; - -template <> -std::unique_ptr column_from_scalar_dispatch::operator()( - scalar const& value, - size_type size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - auto null_mask = detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); - - if (!value.is_valid()) - return std::make_unique(value.type(), - size, - rmm::device_buffer{0, stream, mr}, - null_mask, - size); - - // Create a strings column_view with all nulls and no children. - // Since we are setting every row to the scalar, the fill() never needs to access - // any of the children in the strings column which would otherwise cause an exception. 
- column_view sc{ - data_type{type_id::STRING}, size, nullptr, static_cast(null_mask.data()), size}; - auto sv = static_cast const&>(value); - // fill the column with the scalar - auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, stream, mr); - output->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); // should be no nulls - return output; -} - -template <> -std::unique_ptr column_from_scalar_dispatch::operator()( - scalar const& value, - size_type size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - CUDF_FAIL("dictionary not supported when creating from scalar"); -} - -template <> -std::unique_ptr column_from_scalar_dispatch::operator()( - scalar const& value, - size_type size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - CUDF_FAIL("TODO"); -} - -template <> -std::unique_ptr column_from_scalar_dispatch::operator()( - scalar const& value, - size_type size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - CUDF_FAIL("TODO. struct_view currently not supported."); -} - -std::unique_ptr make_column_from_scalar(scalar const& s, - size_type size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (size == 0) return make_empty_column(s.type()); - return type_dispatcher(s.type(), column_from_scalar_dispatch{}, s, size, stream, mr); -} - std::unique_ptr make_dictionary_from_scalar(scalar const& s, size_type size, rmm::cuda_stream_view stream, diff --git a/cpp/src/column/column_factories.cu b/cpp/src/column/column_factories.cu new file mode 100644 index 00000000000..9168d47aaf7 --- /dev/null +++ b/cpp/src/column/column_factories.cu @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +namespace cudf { + +namespace { + +struct column_from_scalar_dispatch { + template + std::unique_ptr operator()(scalar const& value, + size_type size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + if (size == 0) return make_empty_column(value.type()); + if (!value.is_valid()) + return make_fixed_width_column(value.type(), size, mask_state::ALL_NULL, stream, mr); + auto output_column = + make_fixed_width_column(value.type(), size, mask_state::UNALLOCATED, stream, mr); + auto view = output_column->mutable_view(); + detail::fill_in_place(view, 0, size, value, stream); + return output_column; + } +}; + +template <> +std::unique_ptr column_from_scalar_dispatch::operator()( + scalar const& value, + size_type size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + if (size == 0) return make_empty_column(value.type()); + auto null_mask = detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr); + + if (!value.is_valid()) + return std::make_unique( + value.type(), size, rmm::device_buffer{}, std::move(null_mask), size); + + // Create a strings column_view with all nulls and no children. + // Since we are setting every row to the scalar, the fill() never needs to access + // any of the children in the strings column which would otherwise cause an exception. 
+ column_view sc{ + data_type{type_id::STRING}, size, nullptr, static_cast(null_mask.data()), size}; + auto sv = static_cast const&>(value); + // fill the column with the scalar + auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, stream, mr); + output->set_null_mask(rmm::device_buffer{}, 0); // should be no nulls + return output; +} + +template <> +std::unique_ptr column_from_scalar_dispatch::operator()( + scalar const& value, + size_type size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + CUDF_FAIL("dictionary not supported when creating from scalar"); +} + +template <> +std::unique_ptr column_from_scalar_dispatch::operator()( + scalar const& value, + size_type size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + auto lv = static_cast(&value); + return lists::detail::make_lists_column_from_scalar(*lv, size, stream, mr); +} + +template <> +std::unique_ptr column_from_scalar_dispatch::operator()( + scalar const& value, + size_type size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + if (size == 0) CUDF_FAIL("0-length struct column is unsupported."); + auto ss = static_cast const&>(value); + auto iter = thrust::make_constant_iterator(0); + + auto children = + detail::gather(ss.view(), iter, iter + size, out_of_bounds_policy::NULLIFY, stream, mr); + auto const is_valid = ss.is_valid(); + return make_structs_column(size, + std::move(children->release()), + is_valid ? 0 : size, + is_valid + ? 
rmm::device_buffer{} + : detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr), + stream, + mr); +} + +} // anonymous namespace + +std::unique_ptr make_column_from_scalar(scalar const& s, + size_type size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type_dispatcher(s.type(), column_from_scalar_dispatch{}, s, size, stream, mr); +} + +} // namespace cudf diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 1b948083982..6ba10bef396 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -57,9 +57,6 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi column_device_view::create(std::declval(), std::declval())); auto device_view_owners = std::vector(views.size()); std::transform(views.begin(), views.end(), device_view_owners.begin(), [stream](auto const& col) { - // TODO creating this device view can invoke null count computation - // even though it isn't used. See this issue: - // https://github.com/rapidsai/cudf/issues/4368 return column_device_view::create(col, stream); }); @@ -70,10 +67,8 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi device_view_owners.cend(), std::back_inserter(device_views), [](auto const& col) { return *col; }); - // TODO each of these device vector copies invoke stream synchronization - // which appears to add unnecessary overhead. 
See this issue: - // https://github.com/rapidsai/rmm/issues/120 - auto d_views = make_device_uvector_async(device_views); + + auto d_views = make_device_uvector_async(device_views, stream); // Compute the partition offsets auto offsets = thrust::host_vector(views.size() + 1); @@ -84,7 +79,7 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi std::next(offsets.begin()), [](auto const& col) { return col.size(); }, thrust::plus{}); - auto d_offsets = make_device_uvector_async(offsets); + auto d_offsets = make_device_uvector_async(offsets, stream); auto const output_size = offsets.back(); return std::make_tuple( @@ -455,7 +450,8 @@ rmm::device_buffer concatenate_masks(host_span views, rmm::device_buffer null_mask = create_null_mask(total_element_count, mask_state::UNINITIALIZED, mr); - detail::concatenate_masks(views, static_cast(null_mask.data()), 0); + detail::concatenate_masks( + views, static_cast(null_mask.data()), rmm::cuda_stream_default); return null_mask; } diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 9a2f0f26f74..809390553a4 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -248,14 +248,12 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, * the actual copy. * * @param num_src_bufs Total number of source buffers (N) - * @param num_partitions Number of partitions the each source buffer is split into (M) * @param src_bufs Input source buffers (N) * @param dst_bufs Desination buffers (N*M) * @param buf_info Information on the range of values to be copied for each destination buffer. */ template __global__ void copy_partition(int num_src_bufs, - int num_partitions, uint8_t** src_bufs, uint8_t** dst_bufs, dst_buf_info* buf_info) @@ -447,6 +445,13 @@ struct buf_info_functor { return {current + 1, offset_stack_pos + offset_depth}; } + template + std::enable_if_t::value, std::pair> + operator()(Args&&...) 
+ { + CUDF_FAIL("Unsupported type"); + } + private: std::pair add_null_buffer(column_view const& col, src_buf_info* current, @@ -599,17 +604,6 @@ std::pair buf_info_functor::operator() -std::pair buf_info_functor::operator()( - column_view const& col, - src_buf_info* current, - int offset_stack_pos, - int parent_offset_index, - int offset_depth) -{ - CUDF_FAIL("Unsupported type"); -} - template std::pair setup_source_buf_info(InputIter begin, InputIter end, @@ -660,10 +654,7 @@ BufInfo build_output_columns(InputIter begin, { auto current_info = info_begin; std::transform(begin, end, out_begin, [¤t_info, base_ptr](column_view const& src) { - // Use C++17 structured bindings - bitmask_type const* bitmask_ptr; - size_type null_count; - std::tie(bitmask_ptr, null_count) = [&]() { + auto [bitmask_ptr, null_count] = [&]() { if (src.nullable()) { auto const ptr = current_info->num_elements == 0 @@ -1024,9 +1015,9 @@ std::vector contiguous_split(cudf::table_view const& input, // copy. 1 block per buffer { - constexpr size_type block_size = 512; + constexpr size_type block_size = 256; copy_partition<<>>( - num_src_bufs, num_partitions, d_src_bufs, d_dst_bufs, d_dst_buf_info); + num_src_bufs, d_src_bufs, d_dst_bufs, d_dst_buf_info); } // DtoH dst info (to retrieve null counts) diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index 50bf168037d..670c147aa7e 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,79 @@ inline mask_state should_allocate_mask(mask_allocation_policy mask_alloc, bool m } } +/** + * @brief Functor to produce an empty column of the same type as the + * input scalar. + * + * In the case of nested types, full column hierarchy is preserved. 
+ */ +template +struct scalar_empty_like_functor_impl { + std::unique_ptr operator()(scalar const& input) + { + return cudf::make_empty_column(input.type()); + } +}; + +template <> +struct scalar_empty_like_functor_impl { + std::unique_ptr operator()(scalar const& input) + { + return cudf::strings::detail::make_empty_strings_column(rmm::cuda_stream_default, + rmm::mr::get_current_device_resource()); + } +}; + +template <> +struct scalar_empty_like_functor_impl { + std::unique_ptr operator()(scalar const& input) + { + auto ls = static_cast(&input); + + // TODO: add a manual constructor for lists_column_view. + column_view offsets{cudf::data_type{cudf::type_id::INT32}, 0, nullptr}; + std::vector children; + children.push_back(offsets); + children.push_back(ls->view()); + column_view lcv{cudf::data_type{cudf::type_id::LIST}, 0, nullptr, nullptr, 0, 0, children}; + + return empty_like(lcv); + } +}; + +template <> +struct scalar_empty_like_functor_impl { + std::unique_ptr operator()(scalar const& input) + { + auto ss = static_cast(&input); + + // TODO: add a manual constructor for structs_column_view + // TODO: add cudf::get_element() support for structs + cudf::table_view tbl = ss->view(); + std::vector children(tbl.begin(), tbl.end()); + column_view scv{cudf::data_type{cudf::type_id::STRUCT}, 0, nullptr, nullptr, 0, 0, children}; + + return empty_like(scv); + } +}; + +template <> +struct scalar_empty_like_functor_impl { + std::unique_ptr operator()(scalar const& input) + { + CUDF_FAIL("Dictionary scalars not supported"); + } +}; + +struct scalar_empty_like_functor { + template + std::unique_ptr operator()(scalar const& input) + { + scalar_empty_like_functor_impl func; + return func(input); + } +}; + } // namespace /* @@ -91,6 +165,15 @@ std::unique_ptr empty_like(column_view const& input) input.type(), 0, rmm::device_buffer{}, rmm::device_buffer{}, 0, std::move(children)); } +/* + * Initializes and returns an empty column of the same type as the `input`. 
+ */ +std::unique_ptr empty_like(scalar const& input) +{ + CUDF_FUNC_RANGE(); + return type_dispatcher(input.type(), detail::scalar_empty_like_functor{}, input); +}; + /* * Creates a table of empty columns with the same types as the `input_table` */ diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index fecf7d18d46..9f8e6f7bdcb 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -17,12 +17,14 @@ #include #include #include +#include #include #include +#include #include #include - #include + #include namespace cudf { @@ -38,11 +40,28 @@ struct copy_if_else_functor_impl { } }; +/** + * @brief Functor to fetch a device-view for the specified scalar/column_view. + */ +struct get_iterable_device_view { + template ::value)> + auto operator()(T const& input) + { + return cudf::column_device_view::create(input); + } + + template ::value)> + auto operator()(T const& input) + { + return &input; + } +}; + template struct copy_if_else_functor_impl()>> { template - std::unique_ptr operator()(Left const& lhs, - Right const& rhs, + std::unique_ptr operator()(Left const& lhs_h, + Right const& rhs_h, size_type size, bool left_nullable, bool right_nullable, @@ -50,6 +69,11 @@ struct copy_if_else_functor_impl rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + auto p_lhs = get_iterable_device_view{}(lhs_h); + auto p_rhs = get_iterable_device_view{}(rhs_h); + auto const& lhs = *p_lhs; + auto const& rhs = *p_rhs; + if (left_nullable) { if (right_nullable) { auto lhs_iter = cudf::detail::make_pair_iterator(lhs); @@ -81,8 +105,8 @@ struct copy_if_else_functor_impl template <> struct copy_if_else_functor_impl { template - std::unique_ptr operator()(Left const& lhs, - Right const& rhs, + std::unique_ptr operator()(Left const& lhs_h, + Right const& rhs_h, size_type size, bool left_nullable, bool right_nullable, @@ -92,6 +116,11 @@ struct copy_if_else_functor_impl { { using T = string_view; + auto p_lhs = get_iterable_device_view{}(lhs_h); + 
auto p_rhs = get_iterable_device_view{}(rhs_h); + auto const& lhs = *p_lhs; + auto const& rhs = *p_rhs; + if (left_nullable) { if (right_nullable) { auto lhs_iter = cudf::detail::make_pair_iterator(lhs); @@ -115,40 +144,111 @@ struct copy_if_else_functor_impl { }; /** - * @brief Specialization of copy_if_else_functor for list_views. + * @brief Functor to generate gather-map for LHS column + * + * If specified `Predicate` evaluates to `true` for index `i`, + * gather map must contain `i` (to select LHS[i]). + * If false, gather map must have `null_index`, so that a null + * is gathered in its place. */ -template <> -struct copy_if_else_functor_impl { - template - std::unique_ptr operator()(Left const& lhs, - Right const& rhs, - size_type size, - bool left_nullable, - bool right_nullable, - Filter filter, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +template +class lhs_gather_map_functor { + public: + lhs_gather_map_functor(Predicate predicate, size_type null_index) + : _pred(predicate), _null_index(null_index) { - CUDF_FAIL("copy_if_else not supported for list_view yet"); } + + size_type __device__ operator()(size_type i) const { return _pred(i) ? i : _null_index; } + + private: + Predicate _pred; + size_type _null_index; }; -template <> -struct copy_if_else_functor_impl { - template - std::unique_ptr operator()(Left const& lhs, - Right const& rhs, - size_type size, - bool left_nullable, - bool right_nullable, - Filter filter, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - CUDF_FAIL("copy_if_else not supported for struct_view yet"); - } +/** + * @brief Adapter to negate predicates. + */ +template +class logical_not { + public: + explicit logical_not(Predicate predicate) : _pred{predicate} {} + + bool __device__ operator()(size_type i) const { return not _pred(i); } + + private: + Predicate _pred; }; +/** + * @brief Implementation of copy_if_else() with gather()/scatter(). 
+ * + * Currently supports only nested-type column_views. Scalars are not supported. + */ +template +std::unique_ptr scatter_gather_based_if_else(Left const& lhs, + Right const& rhs, + size_type size, + Filter is_left, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if constexpr (std::is_same::value && + std::is_same::value) { + auto const null_map_entry = size + 1; // Out of bounds index, for gather() to nullify. + + auto const gather_lhs = make_counting_transform_iterator( + size_type{0}, lhs_gather_map_functor{is_left, null_map_entry}); + + auto const lhs_gathered_columns = + cudf::detail::gather(table_view{std::vector{lhs}}, + gather_lhs, + gather_lhs + size, + out_of_bounds_policy::NULLIFY, + stream, + mr) + ->release(); + auto& lhs_partial_output = lhs_gathered_columns[0]; + + auto scatter_map_rhs = rmm::device_uvector{static_cast(size), stream}; + auto const scatter_map_end = thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(size_type{size}), + scatter_map_rhs.begin(), + logical_not{is_left}); + + auto const scatter_src_rhs = cudf::detail::gather(table_view{std::vector{rhs}}, + scatter_map_rhs.begin(), + scatter_map_end, + out_of_bounds_policy::DONT_CHECK, + stream); + + auto result = cudf::detail::scatter( + table_view{std::vector{scatter_src_rhs->get_column(0).view()}}, + scatter_map_rhs.begin(), + scatter_map_end, + table_view{std::vector{lhs_partial_output->view()}}, + false, + stream, + mr); + + return std::move(result->release()[0]); + } + + // Bail out for Scalars. + // For nested types types, scatter/gather based copy_if_else() is not currently supported + // if either `lhs` or `rhs` is a scalar, partially because: + // 1. Struct scalars are not yet available. + // 2. List scalars do not yet support explosion to a full column. 
+ CUDF_FAIL("Scalars of nested types are not currently supported!"); + (void)lhs; + (void)rhs; + (void)size; + (void)is_left; + (void)stream; + (void)mr; +} + /** * @brief Functor called by the `type_dispatcher` to invoke copy_if_else on combinations * of column_view and scalar @@ -164,6 +264,12 @@ struct copy_if_else_functor { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + if constexpr (std::is_same_v or std::is_same_v) { + (void)left_nullable; + (void)right_nullable; + return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr); + } + copy_if_else_functor_impl copier{}; return copier(lhs, rhs, size, left_nullable, right_nullable, filter, stream, mr); } @@ -183,7 +289,7 @@ std::unique_ptr copy_if_else(Left const& lhs, CUDF_EXPECTS(boolean_mask.type() == data_type(type_id::BOOL8), "Boolean mask column must be of type type_id::BOOL8"); - if (boolean_mask.is_empty()) { return cudf::make_empty_column(lhs.type()); } + if (boolean_mask.is_empty()) { return cudf::empty_like(lhs); } auto bool_mask_device_p = column_device_view::create(boolean_mask); column_device_view bool_mask_device = *bool_mask_device_p; @@ -230,13 +336,7 @@ std::unique_ptr copy_if_else(column_view const& lhs, CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs and rhs columns"); CUDF_EXPECTS(lhs.size() == rhs.size(), "Both columns must be of the size"); - return copy_if_else(*column_device_view::create(lhs), - *column_device_view::create(rhs), - lhs.has_nulls(), - rhs.has_nulls(), - boolean_mask, - stream, - mr); + return copy_if_else(lhs, rhs, lhs.has_nulls(), rhs.has_nulls(), boolean_mask, stream, mr); } std::unique_ptr copy_if_else(scalar const& lhs, @@ -247,13 +347,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, { CUDF_EXPECTS(boolean_mask.size() == rhs.size(), "Boolean mask column must be the same size as rhs column"); - return copy_if_else(lhs, - *column_device_view::create(rhs), - !lhs.is_valid(), - 
rhs.has_nulls(), - boolean_mask, - stream, - mr); + return copy_if_else(lhs, rhs, !lhs.is_valid(), rhs.has_nulls(), boolean_mask, stream, mr); } std::unique_ptr copy_if_else(column_view const& lhs, @@ -264,13 +358,7 @@ std::unique_ptr copy_if_else(column_view const& lhs, { CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs column"); - return copy_if_else(*column_device_view::create(lhs), - rhs, - lhs.has_nulls(), - !rhs.is_valid(), - boolean_mask, - stream, - mr); + return copy_if_else(lhs, rhs, lhs.has_nulls(), !rhs.is_valid(), boolean_mask, stream, mr); } std::unique_ptr copy_if_else(scalar const& lhs, diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index f4ce9ea27ac..39a947d2ab9 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -90,17 +90,6 @@ struct out_of_place_copy_range_dispatch { cudf::column_view const& source; cudf::column_view const& target; - template ())> - std::unique_ptr operator()( - cudf::size_type source_begin, - cudf::size_type source_end, - cudf::size_type target_begin, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - { - CUDF_FAIL("Unsupported type for out of place copy."); - } - template ())> std::unique_ptr operator()( cudf::size_type source_begin, @@ -122,6 +111,13 @@ struct out_of_place_copy_range_dispatch { return p_ret; } + + template + std::enable_if_t(), std::unique_ptr> + operator()(Args...) 
+ { + CUDF_FAIL("Unsupported type for out of place copy."); + } }; template <> @@ -212,17 +208,6 @@ std::unique_ptr out_of_place_copy_range_dispatch::operator() -std::unique_ptr out_of_place_copy_range_dispatch::operator()( - cudf::size_type source_begin, - cudf::size_type source_end, - cudf::size_type target_begin, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FAIL("list_view type not supported"); -} - } // namespace namespace cudf { diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index 446f9b0dda9..a4d863d204d 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,8 +17,11 @@ #include #include #include +#include #include #include +#include +#include #include #include @@ -122,7 +125,22 @@ struct get_element_functor { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { - CUDF_FAIL("get_element_functor not supported for list_view"); + bool valid = is_element_valid_sync(input, index, stream); + auto const child_col_idx = lists_column_view::child_column_index; + + if (valid) { + lists_column_view lcv(input); + // Make a copy of the row + auto row_slice_contents = + lists::detail::copy_slice(lcv, index, index + 1, stream, mr)->release(); + // Construct scalar with row data + return std::make_unique( + std::move(*row_slice_contents.children[child_col_idx]), valid, stream, mr); + } else { + auto empty_row_contents = empty_like(input)->release(); + return std::make_unique( + std::move(*empty_row_contents.children[child_col_idx]), valid, stream, mr); + } } template ()> *p = nullptr> @@ -156,12 +174,9 @@ struct get_element_functor { mr); } - template ::value> *p = nullptr> - 
std::unique_ptr operator()( - column_view const &input, - size_type index, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) + template + std::enable_if_t::value, std::unique_ptr> operator()( + Args &&...) { CUDF_FAIL("get_element_functor not supported for struct_view"); } diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index 0d1bb5a8312..182e3ff0584 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -17,6 +17,8 @@ #include #include +#include + namespace cudf { namespace detail { @@ -216,7 +218,7 @@ table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data) packed_columns pack(cudf::table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::pack(input, 0, mr); + return detail::pack(input, rmm::cuda_stream_default, mr); } /** diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu index db0984068cf..42dc9f76b18 100644 --- a/cpp/src/copying/sample.cu +++ b/cpp/src/copying/sample.cu @@ -25,7 +25,6 @@ #include -#include #include #include #include diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index cedac96cee6..a932957ada4 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -149,18 +149,15 @@ struct column_scalar_scatterer_impl { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - CUDF_FAIL("scatter scalar to list_view not implemented"); + return lists::detail::scatter( + source, scatter_iter, scatter_iter + scatter_rows, target, stream, mr); } }; template struct column_scalar_scatterer_impl { - std::unique_ptr operator()(std::reference_wrapper const& source, - MapIterator scatter_iter, - size_type scatter_rows, - column_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + template + std::unique_ptr operator()(Args&&...) 
const { CUDF_FAIL("scatter scalar to struct_view not implemented"); } @@ -197,8 +194,8 @@ struct column_scalar_scatterer_impl { auto contents = new_indices->release(); auto indices_column = std::make_unique(indices_type, static_cast(output_size), - *(contents.data.release()), - rmm::device_buffer{0, stream, mr}, + std::move(*(contents.data.release())), + rmm::device_buffer{}, 0); // use the keys from the matched column std::unique_ptr keys_column(std::move(dict_target->release().children.back())); diff --git a/cpp/src/copying/segmented_shift.cu b/cpp/src/copying/segmented_shift.cu new file mode 100644 index 00000000000..6fc785a61c6 --- /dev/null +++ b/cpp/src/copying/segmented_shift.cu @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf { +namespace detail { + +namespace { + +/** + * @brief Helper function to invoke general `copy_if_else` + */ +template +std::unique_ptr segmented_shift_rep_impl(PairIterator input_pair_iterator, + ScalarIterator fill_pair_iterator, + bool nullable, + size_type offset, + device_span segment_offsets, + data_type value_type, + size_type column_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (offset > 0) { + auto filter = [segment_offsets, offset] __device__(auto const& i) { + auto segment_bound_idx = + thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i) - 1; + return not(*segment_bound_idx <= i and i < *segment_bound_idx + offset); + }; + return copy_if_else(nullable, + input_pair_iterator, + input_pair_iterator + column_size, + fill_pair_iterator, + filter, + value_type, + stream, + mr); + } else { + auto filter = [segment_offsets, offset] __device__(auto const& i) { + auto segment_bound_idx = + thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i); + return not(*segment_bound_idx + offset <= i and i < *segment_bound_idx); + }; + return copy_if_else(nullable, + input_pair_iterator, + input_pair_iterator + column_size, + fill_pair_iterator, + filter, + value_type, + stream, + mr); + } +} + +/** + * @brief Helper function to invoke string specialization of `copy_if_else` + */ +template +std::unique_ptr segmented_shift_string_impl(PairIterator input_pair_iterator, + ScalarIterator fill_pair_iterator, + size_type offset, + device_span segment_offsets, + size_type column_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (offset > 0) { + auto filter = [segment_offsets, offset] __device__(auto const& i) { + auto segment_bound_idx = + thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i) 
- 1; + return not(*segment_bound_idx <= i and i < *segment_bound_idx + offset); + }; + return strings::detail::copy_if_else(input_pair_iterator, + input_pair_iterator + column_size, + fill_pair_iterator, + filter, + stream, + mr); + } else { + auto filter = [segment_offsets, offset] __device__(auto const& i) { + auto segment_bound_idx = + thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i); + return not(*segment_bound_idx + offset <= i and i < *segment_bound_idx); + }; + return strings::detail::copy_if_else(input_pair_iterator, + input_pair_iterator + column_size, + fill_pair_iterator, + filter, + stream, + mr); + } +} + +template +struct segmented_shift_functor { + template + std::unique_ptr operator()(Args&&...) + { + CUDF_FAIL("Unsupported type for segmented_shift."); + } +}; + +/** + * @brief Segmented shift specialization for representation layout compatible types. + */ +template +struct segmented_shift_functor()>> { + std::unique_ptr operator()(column_view const& segmented_values, + device_span segment_offsets, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto values_device_view = column_device_view::create(segmented_values, stream); + auto fill_pair_iterator = make_pair_iterator(fill_value); + bool nullable = not fill_value.is_valid() or segmented_values.nullable(); + + if (segmented_values.has_nulls()) { + auto input_pair_iterator = make_pair_iterator(*values_device_view) - offset; + return segmented_shift_rep_impl(input_pair_iterator, + fill_pair_iterator, + nullable, + offset, + segment_offsets, + segmented_values.type(), + segmented_values.size(), + stream, + mr); + } else { + auto input_pair_iterator = make_pair_iterator(*values_device_view) - offset; + return segmented_shift_rep_impl(input_pair_iterator, + fill_pair_iterator, + nullable, + offset, + segment_offsets, + segmented_values.type(), + segmented_values.size(), + stream, + mr); + } + 
} +}; + +/** + * @brief Segmented shift specialization for `string_view`. + */ +template <> +struct segmented_shift_functor { + std::unique_ptr operator()(column_view const& segmented_values, + device_span segment_offsets, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + using T = string_view; + + auto values_device_view = column_device_view::create(segmented_values, stream); + auto fill_pair_iterator = make_pair_iterator(fill_value); + if (segmented_values.has_nulls()) { + auto input_pair_iterator = make_pair_iterator(*values_device_view) - offset; + return segmented_shift_string_impl(input_pair_iterator, + fill_pair_iterator, + offset, + segment_offsets, + segmented_values.size(), + stream, + mr); + } else { + auto input_pair_iterator = make_pair_iterator(*values_device_view) - offset; + return segmented_shift_string_impl(input_pair_iterator, + fill_pair_iterator, + offset, + segment_offsets, + segmented_values.size(), + stream, + mr); + } + } +}; + +/** + * @brief Functor to instantiate the specializations for segmented shift and + * forward arguments. 
+ */ +struct segmented_shift_functor_forwarder { + template + std::unique_ptr operator()(column_view const& segmented_values, + device_span segment_offsets, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + segmented_shift_functor shifter; + return shifter(segmented_values, segment_offsets, offset, fill_value, stream, mr); + } +}; + +} // namespace + +std::unique_ptr segmented_shift(column_view const& segmented_values, + device_span segment_offsets, + size_type offset, + scalar const& fill_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (segmented_values.is_empty()) { return empty_like(segmented_values); } + if (offset == 0) { return std::make_unique(segmented_values); }; + + return type_dispatcher(segmented_values.type(), + segmented_shift_functor_forwarder{}, + segmented_values, + segment_offsets, + offset, + fill_value, + stream, + mr); +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu index cf85bf51e80..ebeaf0e3b20 100644 --- a/cpp/src/copying/shift.cu +++ b/cpp/src/copying/shift.cu @@ -46,8 +46,7 @@ inline bool __device__ out_of_bounds(size_type size, size_type idx) struct shift_functor { template - std::enable_if_t(), std::unique_ptr> operator()( - Args&&... args) + std::enable_if_t(), std::unique_ptr> operator()(Args&&...) 
{ CUDF_FAIL("shift does not support non-fixed-width types."); } diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index f32f351487a..e3d1ea88ece 100644 --- a/cpp/src/dictionary/add_keys.cu +++ b/cpp/src/dictionary/add_keys.cu @@ -63,6 +63,7 @@ std::unique_ptr add_keys( std::vector{0}, // only one key column duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, + null_order::BEFORE, stream, mr) ->release(); diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index cdf086e3f4a..4aa1e3e2278 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -169,18 +169,12 @@ struct dispatch_compute_indices { return result; } - template + template typename std::enable_if_t(), std::unique_ptr> - operator()(column_view const&, - column_view const&, - column_view const&, - offsets_pair const*, - size_type const*, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource*) + operator()(Args&&...) { - CUDF_FAIL("list_view as keys for dictionary not supported"); + CUDF_FAIL("dictionary concatenate not supported for this column type"); } }; @@ -213,6 +207,7 @@ std::unique_ptr concatenate(host_span columns, std::vector{0}, duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, + null_order::BEFORE, stream, mr) ->release(); diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index 73d1becf639..35e7d5fbc27 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -134,7 +134,7 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys, // If the types match, then just commandeer the column's data buffer. 
if (new_type.id() == indices_type) { return std::make_unique( - new_type, indices_size, *(contents.data.release()), rmm::device_buffer{0, stream, mr}, 0); + new_type, indices_size, std::move(*(contents.data.release())), rmm::device_buffer{}, 0); } // If the new type does not match, then convert the data. cudf::column_view cast_view{cudf::data_type{indices_type}, indices_size, contents.data->data()}; diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index f8f1d01b4a5..9b644f38794 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -85,7 +85,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, rmm::mr::device_memory_resource* mr) { if (input.is_empty()) { return cudf::empty_like(input.parent()); } - if (!input.has_nulls()) { return std::make_unique(input.parent()); } + if (!input.has_nulls()) { return std::make_unique(input.parent(), stream, mr); } CUDF_EXPECTS(input.keys().type() == replacement.keys().type(), "keys must match"); CUDF_EXPECTS(replacement.size() == input.size(), "column sizes must match"); @@ -118,7 +118,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, { if (input.is_empty()) { return cudf::empty_like(input.parent()); } if (!input.has_nulls() || !replacement.is_valid()) { - return std::make_unique(input.parent()); + return std::make_unique(input.parent(), stream, mr); } CUDF_EXPECTS(input.keys().type() == replacement.type(), "keys must match scalar type"); diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu index 0aaf10707f4..5db12d75d62 100644 --- a/cpp/src/dictionary/search.cu +++ b/cpp/src/dictionary/search.cu @@ -45,7 +45,7 @@ struct dispatch_scalar_index { template ()>* = nullptr> - std::unique_ptr operator()(Args&&... args) + std::unique_ptr operator()(Args&&...) 
{ CUDF_FAIL("indices must be an integral type"); } @@ -89,33 +89,18 @@ struct find_index_fn { stream, mr); } - template ::value>* = nullptr> - std::unique_ptr operator()(dictionary_column_view const& input, - scalar const& key, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - CUDF_FAIL("dictionary column cannot be the keys column of another dictionary"); - } - - template ::value>* = nullptr> - std::unique_ptr operator()(dictionary_column_view const& input, - scalar const& key, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - CUDF_FAIL("list_view column cannot be the keys column of a dictionary"); - } template ::value>* = nullptr> - std::unique_ptr operator()(dictionary_column_view const& input, - scalar const& key, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + std::enable_if_t::value or + std::is_same::value or + std::is_same::value>* = nullptr> + std::unique_ptr operator()(dictionary_column_view const&, + scalar const&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const { - CUDF_FAIL("struct_view column cannot be the keys column of a dictionary"); + CUDF_FAIL( + "dictionary, list_view, and struct_view columns cannot be the keys column of a dictionary"); } }; @@ -151,12 +136,12 @@ struct find_insert_index_fn { std::enable_if_t::value or std::is_same::value or std::is_same::value>* = nullptr> - std::unique_ptr operator()(dictionary_column_view const& input, - scalar const& key, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const + std::unique_ptr operator()(dictionary_column_view const&, + scalar const&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const { - CUDF_FAIL("column cannot be the keys for dictionary"); + CUDF_FAIL("dictionary, list_view, and struct_view columns cannot be the keys for a dictionary"); } }; diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 
f3f1ffcfdab..8f07c9cbbed 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -85,15 +85,12 @@ struct dispatch_compute_indices { return result; } - template + template typename std::enable_if_t(), std::unique_ptr> - operator()(dictionary_column_view const& input, - column_view const& new_keys, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + operator()(Args&&...) { - CUDF_FAIL("list_view dictionary set_keys not supported yet"); + CUDF_FAIL("dictionary set_keys not supported for this column type"); } }; @@ -115,6 +112,7 @@ std::unique_ptr set_keys( std::vector{0}, duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL, + null_order::BEFORE, stream, mr) ->release(); diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 0c4a4ae127e..28a21d92ef9 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -29,7 +29,7 @@ #include #include -#include +#include #include #include @@ -72,38 +72,26 @@ struct count_accessor { } }; -struct compute_offsets { - cudf::column_view const* p_column = nullptr; +struct count_checker { + cudf::column_view const& count; template - std::enable_if_t::value, rmm::device_vector> operator()( - bool check_count, rmm::cuda_stream_view stream) + std::enable_if_t::value, void> operator()(rmm::cuda_stream_view stream) { // static_cast is necessary due to bool - if (check_count && static_cast(std::numeric_limits::max()) > - std::numeric_limits::max()) { - auto max = thrust::reduce(p_column->begin(), p_column->end(), 0, thrust::maximum()); + if (static_cast(std::numeric_limits::max()) > + std::numeric_limits::max()) { + auto max = thrust::reduce( + rmm::exec_policy(stream), count.begin(), count.end(), 0, thrust::maximum()); CUDF_EXPECTS(max <= std::numeric_limits::max(), - "count should not have values larger than size_type's limit."); + "count should not have values larger than size_type maximum."); } - rmm::device_vector offsets(p_column->size()); - 
thrust::inclusive_scan( - rmm::exec_policy(stream), p_column->begin(), p_column->end(), offsets.begin()); - if (check_count == true) { - CUDF_EXPECTS( - thrust::is_sorted(rmm::exec_policy(stream), offsets.begin(), offsets.end()) == true, - "count has negative values or the resulting table has more \ - rows than size_type's limit."); - } - - return offsets; } template - std::enable_if_t::value, rmm::device_vector> operator()( - bool check_count, rmm::cuda_stream_view stream) + std::enable_if_t::value, void> operator()(rmm::cuda_stream_view stream) { - CUDF_FAIL("count value should be a integral type."); + CUDF_FAIL("count value type should be integral."); } }; @@ -122,10 +110,21 @@ std::unique_ptr
repeat(table_view const& input_table, if (input_table.num_rows() == 0) { return cudf::empty_like(input_table); } - auto offsets = cudf::type_dispatcher(count.type(), compute_offsets{&count}, check_count, stream); + if (check_count) { cudf::type_dispatcher(count.type(), count_checker{count}, stream); } + + auto count_iter = cudf::detail::indexalator_factory::make_input_iterator(count); + + rmm::device_uvector offsets(count.size(), stream); + thrust::inclusive_scan( + rmm::exec_policy(stream), count_iter, count_iter + count.size(), offsets.begin()); + + if (check_count) { + CUDF_EXPECTS(thrust::is_sorted(rmm::exec_policy(stream), offsets.begin(), offsets.end()), + "count has negative values or the resulting table has too many rows."); + } - size_type output_size{offsets.back()}; - rmm::device_vector indices(output_size); + size_type output_size{offsets.back_element(stream)}; + rmm::device_uvector indices(output_size, stream); thrust::upper_bound(rmm::exec_policy(stream), offsets.begin(), offsets.end(), @@ -150,8 +149,8 @@ std::unique_ptr
repeat(table_view const& input_table, if ((input_table.num_rows() == 0) || (count == 0)) { return cudf::empty_like(input_table); } auto output_size = input_table.num_rows() * count; - auto map_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [count] __device__(auto i) { return i / count; }); + auto map_begin = cudf::detail::make_counting_transform_iterator( + 0, [count] __device__(auto i) { return i / count; }); auto map_end = map_begin + output_size; return gather(input_table, map_begin, map_end, out_of_bounds_policy::DONT_CHECK, stream, mr); diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 34c57996af3..b265e1c3112 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -19,8 +19,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -33,6 +35,7 @@ #include #include +#include #include #include @@ -78,6 +81,44 @@ std::pair, std::vector> groupby::disp groupby::~groupby() = default; namespace { + +/** + * @brief Factory to construct empty result columns. + * + * Adds special handling for COLLECT_LIST/COLLECT_SET, because: + * 1. `make_empty_column()` does not support construction of nested columns. + * 2. Empty lists need empty child columns, to persist type information. + */ +struct empty_column_constructor { + column_view values; + + template + std::unique_ptr operator()() const + { + using namespace cudf; + using namespace cudf::detail; + + if constexpr (k == aggregation::Kind::COLLECT_LIST || k == aggregation::Kind::COLLECT_SET) { + return make_lists_column( + 0, make_empty_column(data_type{type_to_id()}), empty_like(values), 0, {}); + } + + // If `values` is LIST typed, and the aggregation results match the type, + // construct empty results based on `values`. + // Most generally, this applies if input type matches output type. + // + // Note: `target_type_t` is not recursive, and `ValuesType` does not consider children. 
+ // It is important that `COLLECT_LIST` and `COLLECT_SET` are handled before this + // point, because `COLLECT_LIST(LIST)` produces `LIST`, but `target_type_t` + // wouldn't know the difference. + if constexpr (std::is_same_v, ValuesType>) { + return empty_like(values); + } + + return make_empty_column(target_type(values.type(), k)); + } +}; + /// Make an empty table with appropriate types for requested aggs auto empty_results(host_span requests) { @@ -92,7 +133,8 @@ auto empty_results(host_span requests) request.aggregations.end(), std::back_inserter(results), [&request](auto const& agg) { - return make_empty_column(cudf::detail::target_type(request.values.type(), agg->kind)); + return cudf::detail::dispatch_type_and_aggregation( + request.values.type(), agg->kind, empty_column_constructor{request.values}); }); return aggregation_result{std::move(results)}; @@ -118,25 +160,6 @@ void verify_valid_requests(host_span requests) }); }), "Invalid type/aggregation combination."); - -// The aggregations listed in the lambda below will not work with a values column of type -// dictionary if this is compiled with nvcc/ptxas 10.2. -// https://nvbugswb.nvidia.com/NvBugs5/SWBug.aspx?bugid=3186317&cp= -#if (__CUDACC_VER_MAJOR__ == 10) and (__CUDACC_VER_MINOR__ == 2) - CUDF_EXPECTS( - std::all_of( - requests.begin(), - requests.end(), - [](auto const& request) { - return std::all_of( - request.aggregations.begin(), request.aggregations.end(), [&request](auto const& agg) { - return (!cudf::is_dictionary(request.values.type()) || - !(agg->kind == aggregation::SUM or agg->kind == aggregation::MEAN or - agg->kind == aggregation::STD or agg->kind == aggregation::VARIANCE)); - }); - }), - "dictionary type not supported for this aggregation"); -#endif } } // namespace @@ -202,6 +225,35 @@ groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_re } } +std::pair, std::unique_ptr
> groupby::replace_nulls( + table_view const& values, + host_span replace_policies, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(_keys.num_rows() == values.num_rows(), + "Size mismatch between group labels and value."); + CUDF_EXPECTS(static_cast(replace_policies.size()) == values.num_columns(), + "Size mismatch between num_columns and replace_policies."); + + if (values.is_empty()) { return std::make_pair(empty_like(_keys), empty_like(values)); } + auto const stream = rmm::cuda_stream_default; + + auto const& group_labels = helper().group_labels(stream); + std::vector> results; + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(values.num_columns()), + std::back_inserter(results), + [&](auto i) { + auto grouped_values = helper().grouped_values(values.column(i), stream); + return detail::group_replace_nulls( + grouped_values->view(), group_labels, replace_policies[i], stream, mr); + }); + + return std::make_pair(std::move(helper().sorted_keys(stream, mr)), + std::make_unique
(std::move(results))); +} + // Get the sort helper object detail::sort::sort_groupby_helper& groupby::helper() { @@ -211,5 +263,37 @@ detail::sort::sort_groupby_helper& groupby::helper() return *_helper; }; +std::pair, std::unique_ptr
> groupby::shift( + table_view const& values, + host_span offsets, + std::vector> const& fill_values, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(values.num_columns() == static_cast(fill_values.size()), + "Mismatch number of fill_values and columns."); + CUDF_EXPECTS( + std::all_of(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(values.num_columns()), + [&](auto i) { return values.column(i).type() == fill_values[i].get().type(); }), + "values and fill_value should have the same type."); + + auto stream = rmm::cuda_stream_default; + std::vector> results; + auto const& group_offsets = helper().group_offsets(stream); + std::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(values.num_columns()), + std::back_inserter(results), + [&](size_type i) { + auto grouped_values = helper().grouped_values(values.column(i), stream); + return cudf::detail::segmented_shift( + grouped_values->view(), group_offsets, offsets[i], fill_values[i].get(), stream, mr); + }); + + return std::make_pair(helper().sorted_keys(stream, mr), + std::make_unique(std::move(results))); +} + } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 38aacbe59a7..31b48790861 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -55,33 +55,37 @@ namespace groupby { namespace detail { namespace hash { namespace { -// This is a temporary fix due to compiler bug and we can resort back to -// constexpr once cuda 10.2 becomes RAPIDS's minimum compiler version -#if 0 + /** * @brief List of aggregation operations that can be computed with a hash-based * implementation. 
*/ -constexpr std::array hash_aggregations{ - aggregation::SUM, aggregation::MIN, aggregation::MAX, - aggregation::COUNT_VALID, aggregation::COUNT_ALL, - aggregation::ARGMIN, aggregation::ARGMAX, - aggregation::SUM_OF_SQUARES, - aggregation::MEAN, aggregation::STD, aggregation::VARIANCE}; - -//Could be hash: SUM, PRODUCT, MIN, MAX, COUNT_VALID, COUNT_ALL, ANY, ALL, +constexpr std::array hash_aggregations{aggregation::SUM, + aggregation::PRODUCT, + aggregation::MIN, + aggregation::MAX, + aggregation::COUNT_VALID, + aggregation::COUNT_ALL, + aggregation::ARGMIN, + aggregation::ARGMAX, + aggregation::SUM_OF_SQUARES, + aggregation::MEAN, + aggregation::STD, + aggregation::VARIANCE}; + +// Could be hash: SUM, PRODUCT, MIN, MAX, COUNT_VALID, COUNT_ALL, ANY, ALL, // Compound: MEAN(SUM, COUNT_VALID), VARIANCE, STD(MEAN (SUM, COUNT_VALID), COUNT_VALID), // ARGMAX, ARGMIN -// FIXME(kn): adding SUM_OF_SQUARES causes ptxas compiler crash (<=CUDA 10.2) for more than 3 types! +// TODO replace with std::find in C++20 onwards. 
template -constexpr bool array_contains(std::array const& haystack, T needle) { - for (auto i = 0u; i < N; ++i) { - if (haystack[i] == needle) return true; +constexpr bool array_contains(std::array const& haystack, T needle) +{ + for (auto const& val : haystack) { + if (val == needle) return true; } return false; } -#endif /** * @brief Indicates whether the specified aggregation operation can be computed @@ -93,16 +97,67 @@ constexpr bool array_contains(std::array const& haystack, T needle) { */ bool constexpr is_hash_aggregation(aggregation::Kind t) { - // this is a temporary fix due to compiler bug and we can resort back to - // constexpr once cuda 10.2 becomes RAPIDS's minimum compiler version - // return array_contains(hash_aggregations, t); - return (t == aggregation::SUM) or (t == aggregation::MIN) or (t == aggregation::MAX) or - (t == aggregation::COUNT_VALID) or (t == aggregation::COUNT_ALL) or - (t == aggregation::ARGMIN) or (t == aggregation::ARGMAX) or - (t == aggregation::SUM_OF_SQUARES) or (t == aggregation::MEAN) or - (t == aggregation::STD) or (t == aggregation::VARIANCE); + return array_contains(hash_aggregations, t); } +class groupby_simple_aggregations_collector final + : public cudf::detail::simple_aggregations_collector { + public: + using cudf::detail::simple_aggregations_collector::visit; + + std::vector> visit(data_type col_type, + cudf::detail::min_aggregation const& agg) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() + : make_min_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::max_aggregation const& agg) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? 
make_argmax_aggregation() + : make_max_aggregation()); + return aggs; + } + + std::vector> visit( + data_type col_type, cudf::detail::mean_aggregation const& agg) override + { + CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::var_aggregation const& agg) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::std_aggregation const& agg) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } +}; + template class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { size_t col_idx; @@ -118,6 +173,8 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final rmm::cuda_stream_view stream; public: + using cudf::detail::aggregation_finalizer::visit; + hash_compound_agg_finalizer(size_t col_idx, column_view col, cudf::detail::result_cache* sparse_results, @@ -156,10 +213,9 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final } // Enables conversion of ARGMIN/ARGMAX into MIN/MAX - auto gather_argminmax(aggregation::Kind const& agg_kind) + auto gather_argminmax(aggregation const& agg) { - auto transformed_agg = std::make_unique(agg_kind); - auto arg_result = to_dense_agg_result(*transformed_agg); + auto arg_result = to_dense_agg_result(agg); // We make a view of ARG(MIN/MAX) result without a null mask and gather // using this map. 
The values in data buffer of ARG(MIN/MAX) result // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL @@ -178,7 +234,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final stream, mr); return std::move(gather_argminmax->release()[0]); - }; + } // Declare overloads for each kind of aggregation to dispatch void visit(cudf::aggregation const& agg) override @@ -190,20 +246,24 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final void visit(cudf::detail::min_aggregation const& agg) override { if (dense_results->has_result(col_idx, agg)) return; - if (result_type.id() == type_id::STRING) - dense_results->add_result(col_idx, agg, gather_argminmax(aggregation::ARGMIN)); - else + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmin_aggregation(); + dense_results->add_result(col_idx, agg, gather_argminmax(*transformed_agg)); + } else { dense_results->add_result(col_idx, agg, to_dense_agg_result(agg)); + } } void visit(cudf::detail::max_aggregation const& agg) override { if (dense_results->has_result(col_idx, agg)) return; - if (result_type.id() == type_id::STRING) - dense_results->add_result(col_idx, agg, gather_argminmax(aggregation::ARGMAX)); - else + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmax_aggregation(); + dense_results->add_result(col_idx, agg, gather_argminmax(*transformed_agg)); + } else { dense_results->add_result(col_idx, agg, to_dense_agg_result(agg)); + } } void visit(cudf::detail::mean_aggregation const& agg) override @@ -262,19 +322,22 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final { if (dense_results->has_result(col_idx, agg)) return; auto var_agg = make_variance_aggregation(agg._ddof); - this->visit(*static_cast(var_agg.get())); + this->visit(*dynamic_cast(var_agg.get())); column_view variance = dense_results->get_result(col_idx, *var_agg); auto result = 
cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); dense_results->add_result(col_idx, agg, std::move(result)); } }; - // flatten aggs to filter in single pass aggs -std::tuple, std::vector> +std::tuple, + std::vector>, + std::vector> flatten_single_pass_aggs(host_span requests) { std::vector columns; + std::vector> aggs; std::vector agg_kinds; std::vector col_ids; @@ -283,24 +346,30 @@ flatten_single_pass_aggs(host_span requests) auto const& agg_v = request.aggregations; std::unordered_set agg_kinds_set; - auto insert_agg = [&](size_t i, column_view const& request_values, aggregation::Kind k) { - if (agg_kinds_set.insert(k).second) { - agg_kinds.push_back(k); - columns.push_back(request_values); - col_ids.push_back(i); - } - }; + auto insert_agg = + [&](size_t i, column_view const& request_values, std::unique_ptr&& agg) { + if (agg_kinds_set.insert(agg->kind).second) { + agg_kinds.push_back(agg->kind); + aggs.push_back(std::move(agg)); + columns.push_back(request_values); + col_ids.push_back(i); + } + }; auto values_type = cudf::is_dictionary(request.values.type()) ? 
cudf::dictionary_column_view(request.values).keys().type() : request.values.type(); for (auto&& agg : agg_v) { - for (auto const& agg_s : agg->get_simple_aggregations(values_type)) - insert_agg(i, request.values, agg_s); + groupby_simple_aggregations_collector collector; + + for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { + insert_agg(i, request.values, std::move(agg_s)); + } } } - return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(col_ids)); + return std::make_tuple( + table_view(columns), std::move(agg_kinds), std::move(aggs), std::move(col_ids)); } /** @@ -428,17 +497,14 @@ void compute_single_pass_aggs(table_view const& keys, rmm::cuda_stream_view stream) { // flatten the aggs to a table that can be operated on by aggregate_row - table_view flattened_values; - std::vector aggs; - std::vector col_ids; - std::tie(flattened_values, aggs, col_ids) = flatten_single_pass_aggs(requests); + auto const [flattened_values, agg_kinds, aggs, col_ids] = flatten_single_pass_aggs(requests); // make table that will hold sparse results - table sparse_table = create_sparse_results_table(flattened_values, aggs, stream); + table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); // prepare to launch kernel to do the actual aggregation auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); auto d_values = table_device_view::create(flattened_values, stream); - rmm::device_vector d_aggs(aggs); + auto const d_aggs = cudf::detail::make_device_uvector_async(agg_kinds, stream); bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; @@ -452,15 +518,14 @@ void compute_single_pass_aggs(table_view const& keys, keys.num_rows(), *d_values, *d_sparse_table, - d_aggs.data().get(), + d_aggs.data(), static_cast(row_bitmask.data()), skip_key_rows_with_nulls}); // Add results back to sparse_results cache auto sparse_result_cols = sparse_table.release(); for 
(size_t i = 0; i < aggs.size(); i++) { // Note that the cache will make a copy of this temporary aggregation - auto agg = std::make_unique(aggs[i]); - sparse_results->add_result(col_ids[i], *agg, std::move(sparse_result_cols[i])); + sparse_results->add_result(col_ids[i], *aggs[i], std::move(sparse_result_cols[i])); } } diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 46185e07600..9d8f145a7c9 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -51,7 +51,7 @@ namespace detail { * memoised sorted and/or grouped values and re-using will save on computation * of these values. */ -struct aggregrate_result_functor final : store_result_functor { +struct aggregate_result_functor final : store_result_functor { using store_result_functor::store_result_functor; template void operator()(aggregation const& agg) @@ -61,7 +61,7 @@ struct aggregrate_result_functor final : store_result_functor { }; template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -76,7 +76,7 @@ void aggregrate_result_functor::operator()(aggregation } template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -87,7 +87,7 @@ void aggregrate_result_functor::operator()(aggregation c } template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -99,7 +99,19 @@ void aggregrate_result_functor::operator()(aggregation const& }; template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) return; + + 
cache.add_result( + col_idx, + agg, + detail::group_product( + get_grouped_values(), helper.num_groups(stream), helper.group_labels(stream), stream, mr)); +}; + +template <> +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -114,7 +126,7 @@ void aggregrate_result_functor::operator()(aggregation cons }; template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -129,7 +141,7 @@ void aggregrate_result_functor::operator()(aggregation cons }; template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -166,7 +178,7 @@ void aggregrate_result_functor::operator()(aggregation const& }; template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -203,7 +215,7 @@ void aggregrate_result_functor::operator()(aggregation const& }; template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -227,11 +239,11 @@ void aggregrate_result_functor::operator()(aggregation const& }; template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; - auto var_agg = static_cast(agg); + auto var_agg = dynamic_cast(agg); auto mean_agg = make_mean_aggregation(); auto count_agg = make_count_aggregation(); operator()(*mean_agg); @@ -250,11 +262,11 @@ void aggregrate_result_functor::operator()(aggregation co }; template <> -void 
aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; - auto std_agg = static_cast(agg); + auto std_agg = dynamic_cast(agg); auto var_agg = make_variance_aggregation(std_agg._ddof); operator()(*var_agg); column_view var_result = cache.get_result(col_idx, *var_agg); @@ -264,14 +276,14 @@ void aggregrate_result_functor::operator()(aggregation const& }; template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; auto count_agg = make_count_aggregation(); operator()(*count_agg); column_view group_sizes = cache.get_result(col_idx, *count_agg); - auto quantile_agg = static_cast(agg); + auto quantile_agg = dynamic_cast(agg); auto result = detail::group_quantiles(get_sorted_values(), group_sizes, @@ -285,7 +297,7 @@ void aggregrate_result_functor::operator()(aggregation co }; template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; @@ -305,11 +317,11 @@ void aggregrate_result_functor::operator()(aggregation cons }; template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; - auto nunique_agg = static_cast(agg); + auto nunique_agg = dynamic_cast(agg); auto result = detail::group_nunique(get_sorted_values(), helper.group_labels(stream), @@ -322,19 +334,20 @@ void aggregrate_result_functor::operator()(aggregation con }; template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { if (cache.has_result(col_idx, agg)) return; - auto nth_element_agg = static_cast(agg); + auto 
nth_element_agg = dynamic_cast(agg); auto count_agg = make_count_aggregation(nth_element_agg._null_handling); - if (count_agg->kind == aggregation::COUNT_VALID) + if (count_agg->kind == aggregation::COUNT_VALID) { operator()(*count_agg); - else if (count_agg->kind == aggregation::COUNT_ALL) + } else if (count_agg->kind == aggregation::COUNT_ALL) { operator()(*count_agg); - else + } else { CUDF_FAIL("Wrong count aggregation kind"); + } column_view group_sizes = cache.get_result(col_idx, *count_agg); cache.add_result(col_idx, @@ -351,37 +364,42 @@ void aggregrate_result_functor::operator()(aggregation } template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { auto null_handling = - static_cast(agg)._null_handling; - CUDF_EXPECTS(null_handling == null_policy::INCLUDE, - "null exclusion is not supported on groupby COLLECT_LIST aggregation."); + dynamic_cast(agg)._null_handling; + agg.do_hash(); if (cache.has_result(col_idx, agg)) return; - auto result = detail::group_collect( - get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr); + auto result = detail::group_collect(get_grouped_values(), + helper.group_offsets(stream), + helper.num_groups(stream), + null_handling, + stream, + mr); cache.add_result(col_idx, agg, std::move(result)); }; template <> -void aggregrate_result_functor::operator()(aggregation const& agg) +void aggregate_result_functor::operator()(aggregation const& agg) { auto const null_handling = - static_cast(agg)._null_handling; - CUDF_EXPECTS(null_handling == null_policy::INCLUDE, - "null exclusion is not supported on groupby COLLECT_SET aggregation."); + dynamic_cast(agg)._null_handling; if (cache.has_result(col_idx, agg)) { return; } - auto const collect_result = detail::group_collect( - get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr); + auto const collect_result = 
detail::group_collect(get_grouped_values(), + helper.group_offsets(stream), + helper.num_groups(stream), + null_handling, + stream, + mr); auto const nulls_equal = - static_cast(agg)._nulls_equal; + dynamic_cast(agg)._nulls_equal; auto const nans_equal = - static_cast(agg)._nans_equal; + dynamic_cast(agg)._nans_equal; cache.add_result( col_idx, agg, @@ -403,7 +421,7 @@ std::pair, std::vector> groupby::sort for (size_t i = 0; i < requests.size(); i++) { auto store_functor = - detail::aggregrate_result_functor(i, requests[i].values, helper(), cache, stream, mr); + detail::aggregate_result_functor(i, requests[i].values, helper(), cache, stream, mr); for (size_t j = 0; j < requests[i].aggregations.size(); j++) { // TODO (dm): single pass compute all supported reductions cudf::detail::aggregation_dispatcher( diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu index b7bcd05a72a..1e6a681af94 100644 --- a/cpp/src/groupby/sort/group_collect.cu +++ b/cpp/src/groupby/sort/group_collect.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,30 +17,101 @@ #include #include #include +#include #include +#include #include #include #include +#include + namespace cudf { namespace groupby { namespace detail { +/** + * @brief Purge null entries in grouped values, and adjust group offsets. + * + * @param values Grouped values to be purged + * @param offsets Offsets of groups' starting points + * @param num_groups Number of groups + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + * @return Pair of null-eliminated grouped values and corresponding offsets + */ +std::pair, std::unique_ptr> purge_null_entries( + column_view const &values, + column_view const &offsets, + size_type num_groups, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) +{ + auto values_device_view = column_device_view::create(values, stream); + + auto not_null_pred = [d_value = *values_device_view] __device__(auto i) { + return d_value.is_valid_nocheck(i); + }; + + // Purge null entries in grouped values. + auto null_purged_entries = + cudf::detail::copy_if(table_view{{values}}, not_null_pred, stream, mr)->release(); + + auto null_purged_values = std::move(null_purged_entries.front()); + + // Recalculate offsets after null entries are purged. + rmm::device_uvector null_purged_sizes(num_groups, stream); + + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_groups), + null_purged_sizes.begin(), + [d_offsets = offsets.template begin(), not_null_pred] __device__(auto i) { + return thrust::count_if(thrust::seq, + thrust::make_counting_iterator(d_offsets[i]), + thrust::make_counting_iterator(d_offsets[i + 1]), + not_null_pred); + }); + + auto null_purged_offsets = strings::detail::make_offsets_child_column( + null_purged_sizes.cbegin(), null_purged_sizes.cend(), stream, mr); + + return std::make_pair, std::unique_ptr>( + std::move(null_purged_values), std::move(null_purged_offsets)); +} + std::unique_ptr group_collect(column_view const &values, cudf::device_span group_offsets, size_type num_groups, + null_policy null_handling, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource *mr) { - rmm::device_buffer offsets_data( - group_offsets.data(), group_offsets.size() * sizeof(cudf::size_type), stream, mr); + auto [child_column, + offsets_column] = [null_handling, num_groups, &values, 
&group_offsets, stream, mr] { + auto offsets_column = make_numeric_column( + data_type(type_to_id()), num_groups + 1, mask_state::UNALLOCATED, stream, mr); + + thrust::copy(rmm::exec_policy(stream), + group_offsets.begin(), + group_offsets.end(), + offsets_column->mutable_view().template begin()); - auto offsets = std::make_unique( - cudf::data_type(cudf::type_to_id()), num_groups + 1, std::move(offsets_data)); + // If column of grouped values contains null elements, and null_policy == EXCLUDE, + // those elements must be filtered out, and offsets recomputed. + if (null_handling == null_policy::EXCLUDE && values.has_nulls()) { + return cudf::groupby::detail::purge_null_entries( + values, offsets_column->view(), num_groups, stream, mr); + } else { + return std::make_pair(std::make_unique(values, stream, mr), + std::move(offsets_column)); + } + }(); return make_lists_column(num_groups, - std::move(offsets), - std::make_unique(values, stream, mr), + std::move(offsets_column), + std::move(child_column), 0, rmm::device_buffer{0, stream, mr}, stream, diff --git a/cpp/src/groupby/sort/group_count_scan.cu b/cpp/src/groupby/sort/group_count_scan.cu index 4ad533aebdc..0caef47f0e3 100644 --- a/cpp/src/groupby/sort/group_count_scan.cu +++ b/cpp/src/groupby/sort/group_count_scan.cu @@ -23,7 +23,6 @@ #include #include -#include #include namespace cudf { diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu index e6c10aa1056..c3d874f3b33 100644 --- a/cpp/src/groupby/sort/group_nth_element.cu +++ b/cpp/src/groupby/sort/group_nth_element.cu @@ -27,6 +27,9 @@ #include +#include +#include + namespace cudf { namespace groupby { namespace detail { @@ -45,7 +48,10 @@ std::unique_ptr group_nth_element(column_view const &values, if (num_groups == 0) { return empty_like(values); } - auto nth_index = rmm::device_vector(num_groups, values.size()); + auto nth_index = rmm::device_uvector(num_groups, stream); + // TODO: replace with async version + 
thrust::uninitialized_fill_n( + rmm::exec_policy(stream), nth_index.begin(), num_groups, values.size()); // nulls_policy::INCLUDE (equivalent to pandas nth(dropna=None) but return nulls for n if (null_handling == null_policy::INCLUDE || !values.has_nulls()) { diff --git a/cpp/src/groupby/sort/group_product.cu b/cpp/src/groupby/sort/group_product.cu new file mode 100644 index 00000000000..e9cf8611b58 --- /dev/null +++ b/cpp/src/groupby/sort/group_product.cu @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +std::unique_ptr group_product(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto values_type = cudf::is_dictionary(values.type()) + ? dictionary_column_view(values).keys().type() + : values.type(); + return type_dispatcher(values_type, + reduce_functor{}, + values, + num_groups, + group_labels, + stream, + mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index c9f9e3cad9e..64ddc8f6b9d 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -14,21 +14,22 @@ * limitations under the License. 
*/ +#include +#include "group_reductions.hpp" + #include #include #include #include +#include #include #include #include -#include -#include #include -#include +#include #include -#include #include namespace cudf { @@ -153,7 +154,7 @@ std::unique_ptr group_quantiles(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - rmm::device_vector dv_quantiles(quantiles); + auto dv_quantiles = cudf::detail::make_device_uvector_async(quantiles, stream); auto values_type = cudf::is_dictionary(values.type()) ? dictionary_column_view(values).keys().type() diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index b69fe6a0291..7cc0aea8362 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,12 +24,24 @@ #include +/** @internal @file Internal API in this file are mostly segmented reduction operations on column, + * which are used in sort-based groupby aggregations. 
+ * + */ namespace cudf { namespace groupby { namespace detail { /** * @brief Internal API to calculate groupwise sum * + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * num_groups = 4 + * + * group_sum = [7, -3, 4, ] + * @endcode + * * @param values Grouped values to get sum of * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to @@ -42,9 +54,40 @@ std::unique_ptr group_sum(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Internal API to calculate groupwise product + * + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * num_groups = 4 + * + * group_product = [6, 2, 4, ] + * @endcode + * + * @param values Grouped values to get product of + * @param num_groups Number of groups + * @param group_labels ID of group that the corresponding value belongs to + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr group_product(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** * @brief Internal API to calculate groupwise minimum value * + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * num_groups = 4 + * + * group_min = [1, -2, 4, ] + * @endcode + * * @param values Grouped values to get minimum from * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to @@ -60,6 +103,14 @@ std::unique_ptr group_min(column_view const& values, /** * @brief Internal API to calculate groupwise maximum value * + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * num_groups = 4 + * + * group_max = [4, -1, 4, ] + * @endcode + * * @param values Grouped values to get maximum from * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to @@ -75,7 +126,15 @@ std::unique_ptr group_max(column_view const& values, /** * @brief Internal API to calculate group-wise indices of maximum values. * - * @param values Ungrouped values to get maximum value's index from + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * num_groups = 4 + * + * group_max = [2, 0, 0, ] + * @endcode + * + * @param values Grouped values to get maximum value's index from * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to * @param key_sort_order Indices indicating sort order of groupby keys @@ -92,7 +151,15 @@ std::unique_ptr group_argmax(column_view const& values, /** * @brief Internal API to calculate group-wise indices of minimum values. 
* - * @param values Ungrouped values to get minimum value's index from + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * num_groups = 4 + * + * group_argmin = [1, 1, 0, ] + * @endcode + * + * @param values Grouped values to get minimum value's index from + * @param num_groups Number of groups * @param group_labels ID of group that the corresponding value belongs to * @param key_sort_order Indices indicating sort order of groupby keys @@ -110,6 +177,14 @@ std::unique_ptr group_argmin(column_view const& values, * @brief Internal API to calculate number of non-null values in each group of * @p values * + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * num_groups = 4 + * + * group_count_valid = [3, 2, 1, 0] + * @endcode + * * @param values Grouped values to get valid count of * @param group_labels ID of group that the corresponding value belongs to * @param num_groups Number of groups ( unique values in @p group_labels ) @@ -125,6 +200,13 @@ std::unique_ptr group_count_valid(column_view const& values, /** * @brief Internal API to calculate number of values in each group of @p values * + * @code{.pseudo} + * group_offsets = [0, 3, 5, 7, 8] + * num_groups = 4 + * + * group_count_all = [3, 2, 2, 1] + * @endcode + * * @param group_offsets Offsets of groups' starting points within @p values * @param num_groups Number of groups ( unique values in @p group_labels ) * @param mr Device memory resource used to allocate the returned column's device memory @@ -138,6 +220,16 @@ std::unique_ptr group_count_all(cudf::device_span group /** * @brief Internal API to calculate groupwise variance * + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * group_means = [2.333333, -1.5, 4.0, ] + * group_sizes = [3, 2, 2, 1] + * ddof = 1 + * + * group_var = [2.333333, 0.5, , ] + * @endcode + * * @param values Grouped values to get variance 
of * @param group_means Pre-calculated groupwise MEAN * @param group_sizes Number of valid elements per group @@ -158,6 +250,16 @@ std::unique_ptr group_var(column_view const& values, /** * @brief Internal API to calculate groupwise quantiles * + * @code{.pseudo} + * values = [1, 2, 4, -2, -1, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * group_sizes = [3, 2, 2, 1] + * num_groups = 4 + * quantiles = [0.25, 0.5] + * + * group_quantiles = [1.5, 2, -1.75, -1.5, 4, 4, , ] + * @endcode + * * @param values Grouped and sorted (within group) values to get quantiles from * @param group_sizes Number of valid elements per group * @param group_offsets Offsets of groups' starting points within @p values @@ -179,6 +281,16 @@ std::unique_ptr group_quantiles(column_view const& values, * @brief Internal API to calculate number of unique values in each group of * @p values * + * @code{.pseudo} + * values = [2, 4, 4, -1, -2, , 4, ] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * group_offsets = [0, 3, 5, 7, 8] + * num_groups = 4 + * + * group_nunique(null_policy::EXCLUDE) = [2, 2, 1, 0] + * group_nunique(null_policy::INCLUDE) = [2, 2, 2, 1] + * @endcode + * * @param values Grouped and sorted (within group) values to get unique count of * @param group_labels ID of group that the corresponding value belongs to * @param num_groups Number of groups ( unique values in @p group_labels ) @@ -200,6 +312,17 @@ std::unique_ptr group_nunique(column_view const& values, /** * @brief Internal API to calculate nth values in each group of @p values * + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_sizes = [3, 2, 2, 1] + * group_labels = [0, 0, 0, 1, 1, 2, 2, 3] + * group_offsets = [0, 3, 5, 7, 8] + * num_groups = 4 + * + * group_nth_element(n=0, null_policy::EXCLUDE) = [2, -1, 4, ] + * group_nth_element(n=0, null_policy::INCLUDE) = [2, -1, , ] + * @endcode + * * @param values Grouped values to get nth value of * @param group_sizes Number of elements per group * @param 
group_labels ID of group that the corresponding value belongs to @@ -223,18 +346,32 @@ std::unique_ptr group_nth_element(column_view const& values, /** * @brief Internal API to collect grouped values into a lists column * + * @code{.pseudo} + * values = [2, 1, 4, -1, -2, , 4, ] + * group_offsets = [0, 3, 5, 7, 8] + * num_groups = 4 + * + * group_collect = [[2, 1, 4], [-1, -2], [, 4], []] + * @endcode + * * @param values Grouped values to collect * @param group_offsets Offsets of groups' starting points within @p values * @param num_groups Number of groups - * @param mr Device memory resource used to allocate the returned column's device memory + * @param null_handling Exclude nulls while collecting if null_policy::EXCLUDE, + * Include nulls if null_policy::INCLUDE. * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory */ std::unique_ptr group_collect(column_view const& values, cudf::device_span group_offsets, size_type num_groups, + null_policy null_handling, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** @endinternal + * + */ } // namespace detail } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/group_replace_nulls.cu b/cpp/src/groupby/sort/group_replace_nulls.cu new file mode 100644 index 00000000000..56e4cb83f71 --- /dev/null +++ b/cpp/src/groupby/sort/group_replace_nulls.cu @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { + +std::unique_ptr group_replace_nulls(cudf::column_view const& grouped_value, + device_span group_labels, + cudf::replace_policy replace_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + cudf::size_type size = grouped_value.size(); + + auto device_in = cudf::column_device_view::create(grouped_value); + auto index = thrust::make_counting_iterator(0); + auto valid_it = cudf::detail::make_validity_iterator(*device_in); + auto in_begin = thrust::make_zip_iterator(thrust::make_tuple(index, valid_it)); + + rmm::device_uvector gather_map(size, stream); + auto gm_begin = thrust::make_zip_iterator( + thrust::make_tuple(gather_map.begin(), thrust::make_discard_iterator())); + + auto func = cudf::detail::replace_policy_functor(); + thrust::equal_to eq; + if (replace_policy == cudf::replace_policy::PRECEDING) { + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.begin() + size, + in_begin, + gm_begin, + eq, + func); + } else { + auto gl_rbegin = thrust::make_reverse_iterator(group_labels.begin() + size); + auto in_rbegin = thrust::make_reverse_iterator(in_begin + size); + auto gm_rbegin = thrust::make_reverse_iterator(gm_begin + size); + thrust::inclusive_scan_by_key( + rmm::exec_policy(stream), gl_rbegin, gl_rbegin + size, in_rbegin, gm_rbegin, eq, func); + } + + auto output = cudf::detail::gather(cudf::table_view({grouped_value}), + gather_map.begin(), + gather_map.end(), + cudf::out_of_bounds_policy::DONT_CHECK, + stream, + mr); + + return std::move(output->release()[0]); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_scan_util.cuh 
b/cpp/src/groupby/sort/group_scan_util.cuh index 9f8614a61b4..53d05b0c48b 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -27,7 +27,6 @@ #include #include -#include #include #include diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 63a68974d6b..67062658c39 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -40,14 +40,17 @@ struct reduce_functor { template static constexpr bool is_supported() { - if (K == aggregation::SUM) - return cudf::is_numeric() || cudf::is_duration() || cudf::is_fixed_point(); - else if (K == aggregation::MIN or K == aggregation::MAX) - return cudf::is_fixed_width() and is_relationally_comparable(); - else if (K == aggregation::ARGMIN or K == aggregation::ARGMAX) - return is_relationally_comparable(); - else - return false; + switch (K) { + case aggregation::SUM: + return cudf::is_numeric() || cudf::is_duration() || cudf::is_fixed_point(); + case aggregation::PRODUCT: return cudf::detail::is_product_supported(); + case aggregation::MIN: + case aggregation::MAX: + return cudf::is_fixed_width() and is_relationally_comparable(); + case aggregation::ARGMIN: + case aggregation::ARGMAX: return is_relationally_comparable(); + default: return false; + } } template @@ -62,7 +65,7 @@ struct reduce_functor { using OpType = cudf::detail::corresponding_operator_t; using ResultType = cudf::detail::target_type_t; - auto result_type = is_fixed_point() + auto result_type = is_fixed_point() ? data_type{type_to_id(), values.type().scale()} : data_type{type_to_id()}; diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index 53be019f73b..a882b33bcdf 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -14,32 +14,23 @@ * limitations under the License. 
*/ #include -#include -#include -#include #include #include -#include -#include #include -#include #include #include -#include #include +#include + +#include #include namespace cudf { +namespace detail { namespace { -// MD5 supported leaf data type check -bool md5_type_check(data_type dt) -{ - return !is_chrono(dt) && (is_fixed_width(dt) || (dt.id() == type_id::STRING)); -} - template std::vector to_leaf_columns(IterType iter_begin, IterType iter_end) { @@ -58,88 +49,6 @@ std::vector to_leaf_columns(IterType iter_begin, IterType iter_end) } // namespace -namespace detail { - -std::unique_ptr hash(table_view const& input, - hash_id hash_function, - std::vector const& initial_hash, - uint32_t seed, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - switch (hash_function) { - case (hash_id::HASH_MURMUR3): return murmur_hash3_32(input, initial_hash, stream, mr); - case (hash_id::HASH_MD5): return md5_hash(input, stream, mr); - case (hash_id::HASH_SERIAL_MURMUR3): - return serial_murmur_hash3_32(input, seed, stream, mr); - case (hash_id::HASH_SPARK_MURMUR3): - return serial_murmur_hash3_32(input, seed, stream, mr); - default: return nullptr; - } -} - -std::unique_ptr md5_hash(table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (input.num_columns() == 0 || input.num_rows() == 0) { - const string_scalar string_128bit("d41d8cd98f00b204e9orig98ecf8427e"); - auto output = make_column_from_scalar(string_128bit, input.num_rows(), stream, mr); - return output; - } - - // Accepts string and fixed width columns, or single layer list columns holding those types - CUDF_EXPECTS( - std::all_of(input.begin(), - input.end(), - [](auto col) { - return md5_type_check(col.type()) || - (col.type().id() == type_id::LIST && md5_type_check(col.child(1).type())); - }), - "MD5 unsupported column type"); - - // Result column allocation and creation - auto begin = thrust::make_constant_iterator(32); - auto offsets_column 
= - cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); - - auto chars_column = strings::detail::create_chars_child_column( - input.num_rows(), 0, input.num_rows() * 32, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); - - rmm::device_buffer null_mask{0, stream, mr}; - - auto const device_input = table_device_view::create(input, stream); - - // Hash each row, hashing each element sequentially left to right - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.num_rows()), - [d_chars, device_input = *device_input] __device__(auto row_index) { - md5_intermediate_data hash_state; - MD5Hash hasher = MD5Hash{}; - for (int col_index = 0; col_index < device_input.num_columns(); col_index++) { - if (device_input.column(col_index).is_valid(row_index)) { - cudf::type_dispatcher(device_input.column(col_index).type(), - hasher, - device_input.column(col_index), - row_index, - &hash_state); - } - } - hasher.finalize(&hash_state, d_chars + (row_index * 32)); - }); - - return make_strings_column(input.num_rows(), - std::move(offsets_column), - std::move(chars_column), - 0, - std::move(null_mask), - stream, - mr); -} - template