diff --git a/.gitignore b/.gitignore index b398cfc4f88..aee3d072de2 100644 --- a/.gitignore +++ b/.gitignore @@ -159,3 +159,7 @@ dask-worker-space/ # protobuf **/*_pb2.py + +# Sphinx docs & build artifacts +docs/cudf/source/api_docs/generated/* +docs/cudf/source/api_docs/api/* \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index cc92cde15a8..de00213a6f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,9 +2,260 @@ Please see https://github.com/rapidsai/cudf/releases/tag/v21.10.00a for the latest changes to this development branch. -# cuDF 21.08.00 (Date TBD) +# cuDF 21.08.00 (4 Aug 2021) -Please see https://github.com/rapidsai/cudf/releases/tag/v21.08.00a for the latest changes to this development branch. +## 🚨 Breaking Changes + +- Fix a crash in pack() when being handed tables with no columns. ([#8697](https://github.com/rapidsai/cudf/pull/8697)) [@nvdbaranec](https://github.com/nvdbaranec) +- Remove unused cudf::strings::create_offsets ([#8663](https://github.com/rapidsai/cudf/pull/8663)) [@davidwendt](https://github.com/davidwendt) +- Add delimiter parameter to cudf::strings::capitalize() ([#8620](https://github.com/rapidsai/cudf/pull/8620)) [@davidwendt](https://github.com/davidwendt) +- Change default datetime index resolution to ns to match pandas ([#8611](https://github.com/rapidsai/cudf/pull/8611)) [@vyasr](https://github.com/vyasr) +- Add sequence_type parameter to cudf::strings::title function ([#8602](https://github.com/rapidsai/cudf/pull/8602)) [@davidwendt](https://github.com/davidwendt) +- Add `strings::repeat_strings` API that can repeat each string a different number of times ([#8561](https://github.com/rapidsai/cudf/pull/8561)) [@ttnghia](https://github.com/ttnghia) +- String-to-boolean conversion is different from Pandas ([#8549](https://github.com/rapidsai/cudf/pull/8549)) [@skirui-source](https://github.com/skirui-source) +- Add accurate hash join size functions ([#8453](https://github.com/rapidsai/cudf/pull/8453)) [@PointKernel](https://github.com/PointKernel) +- Expose a Decimal32Dtype in cuDF Python ([#8438](https://github.com/rapidsai/cudf/pull/8438)) [@skirui-source](https://github.com/skirui-source) +- Update dask make_meta changes to be compatible with dask upstream ([#8426](https://github.com/rapidsai/cudf/pull/8426)) [@galipremsagar](https://github.com/galipremsagar) +- Adapt `cudf::scalar` classes to changes in `rmm::device_scalar` ([#8411](https://github.com/rapidsai/cudf/pull/8411)) [@harrism](https://github.com/harrism) +- Remove special Index class from the general index class hierarchy ([#8309](https://github.com/rapidsai/cudf/pull/8309)) [@vyasr](https://github.com/vyasr) +- Add first-class dtype utilities ([#8308](https://github.com/rapidsai/cudf/pull/8308)) [@vyasr](https://github.com/vyasr) +- ORC - Support reading multiple orc files/buffers in a single operation ([#8142](https://github.com/rapidsai/cudf/pull/8142)) [@jdye64](https://github.com/jdye64) +- Upgrade arrow to 4.0.1 ([#7495](https://github.com/rapidsai/cudf/pull/7495)) [@galipremsagar](https://github.com/galipremsagar) + +## 🐛 Bug Fixes + +- Fix `contains` check in string column ([#8834](https://github.com/rapidsai/cudf/pull/8834)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unused 
variable from `row_bit_count_test`. ([#8829](https://github.com/rapidsai/cudf/pull/8829)) [@mythrocks](https://github.com/mythrocks) +- Fixes issue with null struct columns in ORC reader ([#8819](https://github.com/rapidsai/cudf/pull/8819)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Set CMake vars for python/parquet support in libarrow builds ([#8808](https://github.com/rapidsai/cudf/pull/8808)) [@vyasr](https://github.com/vyasr) +- Handle empty child columns in row_bit_count() ([#8791](https://github.com/rapidsai/cudf/pull/8791)) [@mythrocks](https://github.com/mythrocks) +- Revert "Remove cudf unneeded build time requirement of the cuda driver" ([#8784](https://github.com/rapidsai/cudf/pull/8784)) [@robertmaynard](https://github.com/robertmaynard) +- Fix isort error in utils.pyx ([#8771](https://github.com/rapidsai/cudf/pull/8771)) [@charlesbluca](https://github.com/charlesbluca) +- Handle sliced struct/list columns properly in concatenate() bounds checking. ([#8760](https://github.com/rapidsai/cudf/pull/8760)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix issues with `_CPackedColumns.serialize()` handling of host and device data ([#8759](https://github.com/rapidsai/cudf/pull/8759)) [@charlesbluca](https://github.com/charlesbluca) +- Fix issues with `MultiIndex` in `dropna`, `stack` & `reset_index` ([#8753](https://github.com/rapidsai/cudf/pull/8753)) [@galipremsagar](https://github.com/galipremsagar) +- Write pandas extension types to parquet file metadata ([#8749](https://github.com/rapidsai/cudf/pull/8749)) [@devavret](https://github.com/devavret) +- Fix `where` to handle `DataFrame` & `Series` input combination ([#8747](https://github.com/rapidsai/cudf/pull/8747)) [@galipremsagar](https://github.com/galipremsagar) +- Fix `replace` to handle null values correctly ([#8744](https://github.com/rapidsai/cudf/pull/8744)) [@galipremsagar](https://github.com/galipremsagar) +- Handle sliced structs properly in pack/contiguous_split. ([#8739](https://github.com/rapidsai/cudf/pull/8739)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix issue in slice() where columns with a positive offset were computing null counts incorrectly. 
([#8738](https://github.com/rapidsai/cudf/pull/8738)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix `cudf.Series` constructor to handle list of sequences ([#8735](https://github.com/rapidsai/cudf/pull/8735)) [@galipremsagar](https://github.com/galipremsagar) +- Fix min/max sorted groupby aggregation on string column with nulls (argmin, argmax sentinel value missing on nulls) ([#8731](https://github.com/rapidsai/cudf/pull/8731)) [@karthikeyann](https://github.com/karthikeyann) +- Fix orc reader assert on create data_type in debug ([#8706](https://github.com/rapidsai/cudf/pull/8706)) [@davidwendt](https://github.com/davidwendt) +- Fix min/max inclusive cudf::scan for strings column ([#8705](https://github.com/rapidsai/cudf/pull/8705)) [@davidwendt](https://github.com/davidwendt) +- JNI: Fix driver version assertion logic in testGetCudaRuntimeInfo ([#8701](https://github.com/rapidsai/cudf/pull/8701)) [@sperlingxx](https://github.com/sperlingxx) +- Adding fix for skip_rows and crash in orc reader ([#8700](https://github.com/rapidsai/cudf/pull/8700)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Bug fix: `replace_nulls_policy` functor not returning correct indices for gathermap ([#8699](https://github.com/rapidsai/cudf/pull/8699)) [@isVoid](https://github.com/isVoid) +- Fix a crash in pack() when being handed tables with no columns. ([#8697](https://github.com/rapidsai/cudf/pull/8697)) [@nvdbaranec](https://github.com/nvdbaranec) +- Add post-processing steps to `dask_cudf.groupby.CudfSeriesGroupby.aggregate` ([#8694](https://github.com/rapidsai/cudf/pull/8694)) [@charlesbluca](https://github.com/charlesbluca) +- JNI build no longer looks for Arrow in conda environment ([#8686](https://github.com/rapidsai/cudf/pull/8686)) [@jlowe](https://github.com/jlowe) +- Handle arbitrarily different data in null list column rows when checking for equivalency. ([#8666](https://github.com/rapidsai/cudf/pull/8666)) [@nvdbaranec](https://github.com/nvdbaranec) +- Add ConfigureNVBench to avoid concurrent main() entry points ([#8662](https://github.com/rapidsai/cudf/pull/8662)) [@PointKernel](https://github.com/PointKernel) +- Pin `*arrow` to use `*cuda` in `run` ([#8651](https://github.com/rapidsai/cudf/pull/8651)) [@jakirkham](https://github.com/jakirkham) +- Add proper support for tolerances in testing methods. 
([#8649](https://github.com/rapidsai/cudf/pull/8649)) [@vyasr](https://github.com/vyasr) +- Support multi-char case conversion in capitalize function ([#8647](https://github.com/rapidsai/cudf/pull/8647)) [@davidwendt](https://github.com/davidwendt) +- Fix repeated mangled names in read_csv with duplicate column names ([#8645](https://github.com/rapidsai/cudf/pull/8645)) [@karthikeyann](https://github.com/karthikeyann) +- Temporarily disable libcudf example build tests ([#8642](https://github.com/rapidsai/cudf/pull/8642)) [@isVoid](https://github.com/isVoid) +- Use conda-sourced cudf artifacts for libcudf example in CI ([#8638](https://github.com/rapidsai/cudf/pull/8638)) [@isVoid](https://github.com/isVoid) +- Ensure dev environment uses Arrow GPU packages ([#8637](https://github.com/rapidsai/cudf/pull/8637)) [@charlesbluca](https://github.com/charlesbluca) +- Fix bug that columns only initialized once when specified `columns` and `index` in dataframe ctor ([#8628](https://github.com/rapidsai/cudf/pull/8628)) [@isVoid](https://github.com/isVoid) +- Propagate **kwargs through to as_*_column methods ([#8618](https://github.com/rapidsai/cudf/pull/8618)) [@shwina](https://github.com/shwina) +- Fix orc_reader_benchmark.cpp compile error ([#8609](https://github.com/rapidsai/cudf/pull/8609)) [@davidwendt](https://github.com/davidwendt) +- Fix missed renumbering of Aggregation values ([#8600](https://github.com/rapidsai/cudf/pull/8600)) [@revans2](https://github.com/revans2) +- Update cmake to 3.20.5 in the Java Docker image ([#8593](https://github.com/rapidsai/cudf/pull/8593)) [@NvTimLiu](https://github.com/NvTimLiu) +- Fix bug in replace_with_backrefs when group has greedy quantifier ([#8575](https://github.com/rapidsai/cudf/pull/8575)) [@davidwendt](https://github.com/davidwendt) +- Apply metadata to keys before returning in `Frame._encode` ([#8560](https://github.com/rapidsai/cudf/pull/8560)) [@charlesbluca](https://github.com/charlesbluca) +- Fix for strings containing special JSON characters in get_json_object(). 
([#8556](https://github.com/rapidsai/cudf/pull/8556)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix debug compile error in gather_struct_tests.cpp ([#8554](https://github.com/rapidsai/cudf/pull/8554)) [@davidwendt](https://github.com/davidwendt) +- String-to-boolean conversion is different from Pandas ([#8549](https://github.com/rapidsai/cudf/pull/8549)) [@skirui-source](https://github.com/skirui-source) +- Fix `__repr__` output with `display.max_rows` is `None` ([#8547](https://github.com/rapidsai/cudf/pull/8547)) [@galipremsagar](https://github.com/galipremsagar) +- Fix size passed to column constructors in _with_type_metadata ([#8539](https://github.com/rapidsai/cudf/pull/8539)) [@shwina](https://github.com/shwina) +- Properly retrieve last column when `-1` is specified for column index ([#8529](https://github.com/rapidsai/cudf/pull/8529)) [@isVoid](https://github.com/isVoid) +- Fix importing `apply` from `dask` ([#8517](https://github.com/rapidsai/cudf/pull/8517)) [@galipremsagar](https://github.com/galipremsagar) +- Fix offset of the string dictionary length stream ([#8515](https://github.com/rapidsai/cudf/pull/8515)) [@vuule](https://github.com/vuule) +- Fix double counting of selected columns in CSV reader ([#8508](https://github.com/rapidsai/cudf/pull/8508)) [@ochan1](https://github.com/ochan1) +- Incorrect map size in scatter_to_gather corrupts struct columns ([#8507](https://github.com/rapidsai/cudf/pull/8507)) [@gerashegalov](https://github.com/gerashegalov) +- replace_nulls properly propagates memory resource to gather calls ([#8500](https://github.com/rapidsai/cudf/pull/8500)) [@robertmaynard](https://github.com/robertmaynard) +- Disallow groupby aggs for `StructColumns` ([#8499](https://github.com/rapidsai/cudf/pull/8499)) [@charlesbluca](https://github.com/charlesbluca) +- Fixes out-of-bounds access for small files in unzip ([#8498](https://github.com/rapidsai/cudf/pull/8498)) [@elstehle](https://github.com/elstehle) +- Adding support for writing empty dataframe ([#8490](https://github.com/rapidsai/cudf/pull/8490)) [@shaneding](https://github.com/shaneding) +- Fix exclusive scan when including nulls and improve testing ([#8478](https://github.com/rapidsai/cudf/pull/8478)) [@harrism](https://github.com/harrism) +- Add workaround for crash in libcudf debug build using output_indexalator in thrust::lower_bound ([#8432](https://github.com/rapidsai/cudf/pull/8432)) [@davidwendt](https://github.com/davidwendt) +- Install only the same Thrust files that Thrust itself installs ([#8420](https://github.com/rapidsai/cudf/pull/8420)) [@robertmaynard](https://github.com/robertmaynard) +- Add nightly version for ucx-py in ci script ([#8419](https://github.com/rapidsai/cudf/pull/8419)) [@galipremsagar](https://github.com/galipremsagar) +- Fix null_equality config of rolling_collect_set ([#8415](https://github.com/rapidsai/cudf/pull/8415)) [@sperlingxx](https://github.com/sperlingxx) +- CollectSetAggregation: implement RollingAggregation interface ([#8406](https://github.com/rapidsai/cudf/pull/8406)) [@sperlingxx](https://github.com/sperlingxx) +- Handle pre-sliced nested columns in contiguous_split. 
([#8391](https://github.com/rapidsai/cudf/pull/8391)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix bitmask_tests.cpp host accessing device memory ([#8370](https://github.com/rapidsai/cudf/pull/8370)) [@davidwendt](https://github.com/davidwendt) +- Fix concurrent_unordered_map to prevent accessing padding bits in pair_type ([#8348](https://github.com/rapidsai/cudf/pull/8348)) [@davidwendt](https://github.com/davidwendt) +- BUG FIX: Raise appropriate strings error when concatenating strings column ([#8290](https://github.com/rapidsai/cudf/pull/8290)) [@skirui-source](https://github.com/skirui-source) +- Make gpuCI and pre-commit style configurations consistent ([#8215](https://github.com/rapidsai/cudf/pull/8215)) [@charlesbluca](https://github.com/charlesbluca) +- Add collect list to dask-cudf groupby aggregations ([#8045](https://github.com/rapidsai/cudf/pull/8045)) [@charlesbluca](https://github.com/charlesbluca) + +## 📖 Documentation + +- Update Python UDFs notebook ([#8810](https://github.com/rapidsai/cudf/pull/8810)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix dask.dataframe API docs links after reorg ([#8772](https://github.com/rapidsai/cudf/pull/8772)) [@jsignell](https://github.com/jsignell) +- Fix instructions for running cuDF/dask-cuDF tests in CONTRIBUTING.md ([#8724](https://github.com/rapidsai/cudf/pull/8724)) [@shwina](https://github.com/shwina) +- Translate Markdown documentation to rST and remove recommonmark ([#8698](https://github.com/rapidsai/cudf/pull/8698)) [@vyasr](https://github.com/vyasr) +- Fixed spelling mistakes in libcudf documentation ([#8664](https://github.com/rapidsai/cudf/pull/8664)) [@karthikeyann](https://github.com/karthikeyann) +- Custom Sphinx Extension: `PandasCompat` ([#8643](https://github.com/rapidsai/cudf/pull/8643)) [@isVoid](https://github.com/isVoid) +- Fix README.md ([#8535](https://github.com/rapidsai/cudf/pull/8535)) [@ajschmidt8](https://github.com/ajschmidt8) +- Change namespace contains_nulls to struct ([#8523](https://github.com/rapidsai/cudf/pull/8523)) [@davidwendt](https://github.com/davidwendt) +- Add info about NVTX ranges to dev guide ([#8461](https://github.com/rapidsai/cudf/pull/8461)) [@jrhemstad](https://github.com/jrhemstad) +- Fixed documentation bug in groupby agg method ([#8325](https://github.com/rapidsai/cudf/pull/8325)) [@ahmet-uyar](https://github.com/ahmet-uyar) + +## 🚀 New Features + +- Fix concatenating structs ([#8811](https://github.com/rapidsai/cudf/pull/8811)) [@shaneding](https://github.com/shaneding) +- Implement JNI for groupby aggregations `M2` and `MERGE_M2` ([#8763](https://github.com/rapidsai/cudf/pull/8763)) [@ttnghia](https://github.com/ttnghia) +- Bump `isort` to `5.6.4` and remove `isort` overrides made for 5.0.7 ([#8755](https://github.com/rapidsai/cudf/pull/8755)) [@charlesbluca](https://github.com/charlesbluca) +- Implement `__setitem__` for `StructColumn` ([#8737](https://github.com/rapidsai/cudf/pull/8737)) [@shaneding](https://github.com/shaneding) +- Add `is_leap_year` to `DateTimeProperties` and `DatetimeIndex` ([#8736](https://github.com/rapidsai/cudf/pull/8736)) [@isVoid](https://github.com/isVoid) +- Add `struct.explode()` method 
([#8729](https://github.com/rapidsai/cudf/pull/8729)) [@shwina](https://github.com/shwina) +- Add `DataFrame.to_struct()` method to convert a DataFrame to a struct Series ([#8728](https://github.com/rapidsai/cudf/pull/8728)) [@shwina](https://github.com/shwina) +- Add support for list type in ORC writer ([#8723](https://github.com/rapidsai/cudf/pull/8723)) [@vuule](https://github.com/vuule) +- Fix slicing from struct columns and accessing struct columns ([#8719](https://github.com/rapidsai/cudf/pull/8719)) [@shaneding](https://github.com/shaneding) +- Add `datetime::is_leap_year` ([#8711](https://github.com/rapidsai/cudf/pull/8711)) [@isVoid](https://github.com/isVoid) +- Accessing struct columns from `dask_cudf` ([#8675](https://github.com/rapidsai/cudf/pull/8675)) [@shaneding](https://github.com/shaneding) +- Added pct_change to Series ([#8650](https://github.com/rapidsai/cudf/pull/8650)) [@TravisHester](https://github.com/TravisHester) +- Add strings support to cudf::shift function ([#8648](https://github.com/rapidsai/cudf/pull/8648)) [@davidwendt](https://github.com/davidwendt) +- Support Scatter `struct_scalar` ([#8630](https://github.com/rapidsai/cudf/pull/8630)) [@isVoid](https://github.com/isVoid) +- Struct scalar from host dictionary ([#8629](https://github.com/rapidsai/cudf/pull/8629)) [@shaneding](https://github.com/shaneding) +- Add dayofyear and day_of_year to Series, DatetimeColumn, and DatetimeIndex ([#8626](https://github.com/rapidsai/cudf/pull/8626)) [@beckernick](https://github.com/beckernick) +- JNI support for capitalize ([#8624](https://github.com/rapidsai/cudf/pull/8624)) [@firestarman](https://github.com/firestarman) +- Add delimiter parameter to cudf::strings::capitalize() ([#8620](https://github.com/rapidsai/cudf/pull/8620)) [@davidwendt](https://github.com/davidwendt) +- Add NVBench in CMake ([#8619](https://github.com/rapidsai/cudf/pull/8619)) [@PointKernel](https://github.com/PointKernel) +- Change default datetime index resolution to ns to match pandas ([#8611](https://github.com/rapidsai/cudf/pull/8611)) [@vyasr](https://github.com/vyasr) +- ListColumn `__setitem__` ([#8606](https://github.com/rapidsai/cudf/pull/8606)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Implement groupby aggregations `M2` and `MERGE_M2` ([#8605](https://github.com/rapidsai/cudf/pull/8605)) [@ttnghia](https://github.com/ttnghia) +- Add sequence_type parameter to cudf::strings::title function ([#8602](https://github.com/rapidsai/cudf/pull/8602)) [@davidwendt](https://github.com/davidwendt) +- Adding support for list and struct type in ORC Reader ([#8599](https://github.com/rapidsai/cudf/pull/8599)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Benchmark for `strings::repeat_strings` APIs ([#8589](https://github.com/rapidsai/cudf/pull/8589)) [@ttnghia](https://github.com/ttnghia) +- Nested scalar support for copy if else ([#8588](https://github.com/rapidsai/cudf/pull/8588)) [@gerashegalov](https://github.com/gerashegalov) +- User specified decimal columns to float64 ([#8587](https://github.com/rapidsai/cudf/pull/8587)) [@jdye64](https://github.com/jdye64) +- Add `get_element` for struct column 
([#8578](https://github.com/rapidsai/cudf/pull/8578)) [@isVoid](https://github.com/isVoid) +- Python changes for adding `__getitem__` for `struct` ([#8577](https://github.com/rapidsai/cudf/pull/8577)) [@shaneding](https://github.com/shaneding) +- Add `strings::repeat_strings` API that can repeat each string a different number of times ([#8561](https://github.com/rapidsai/cudf/pull/8561)) [@ttnghia](https://github.com/ttnghia) +- Refactor `tests/iterator_utilities.hpp` functions ([#8540](https://github.com/rapidsai/cudf/pull/8540)) [@ttnghia](https://github.com/ttnghia) +- Support MERGE_LISTS and MERGE_SETS in Java package ([#8516](https://github.com/rapidsai/cudf/pull/8516)) [@sperlingxx](https://github.com/sperlingxx) +- Decimal support csv reader ([#8511](https://github.com/rapidsai/cudf/pull/8511)) [@elstehle](https://github.com/elstehle) +- Add column type tests ([#8505](https://github.com/rapidsai/cudf/pull/8505)) [@isVoid](https://github.com/isVoid) +- Warn when downscaling decimal columns ([#8492](https://github.com/rapidsai/cudf/pull/8492)) [@ChrisJar](https://github.com/ChrisJar) +- Add JNI for `strings::repeat_strings` ([#8491](https://github.com/rapidsai/cudf/pull/8491)) [@ttnghia](https://github.com/ttnghia) +- Add `Index.get_loc` for Numerical, String Index support ([#8489](https://github.com/rapidsai/cudf/pull/8489)) [@isVoid](https://github.com/isVoid) +- Expose half_up rounding in cuDF ([#8477](https://github.com/rapidsai/cudf/pull/8477)) [@shwina](https://github.com/shwina) +- Java APIs to fetch CUDA runtime info ([#8465](https://github.com/rapidsai/cudf/pull/8465)) [@sperlingxx](https://github.com/sperlingxx) +- Add `str.edit_distance_matrix` ([#8463](https://github.com/rapidsai/cudf/pull/8463)) [@isVoid](https://github.com/isVoid) +- Support constructing `cudf.Scalar` objects from host side lists ([#8459](https://github.com/rapidsai/cudf/pull/8459)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add accurate hash join size functions ([#8453](https://github.com/rapidsai/cudf/pull/8453)) [@PointKernel](https://github.com/PointKernel) +- Add cudf::strings::integer_to_hex convert API ([#8450](https://github.com/rapidsai/cudf/pull/8450)) [@davidwendt](https://github.com/davidwendt) +- Create objects from iterables that contain cudf.NA ([#8442](https://github.com/rapidsai/cudf/pull/8442)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- JNI bindings for sort_lists ([#8439](https://github.com/rapidsai/cudf/pull/8439)) [@sperlingxx](https://github.com/sperlingxx) +- Expose a Decimal32Dtype in cuDF Python ([#8438](https://github.com/rapidsai/cudf/pull/8438)) [@skirui-source](https://github.com/skirui-source) +- Replace `all_null()` and `all_valid()` by `iterator_all_nulls()` and `iterator_no_null()` in tests ([#8437](https://github.com/rapidsai/cudf/pull/8437)) [@ttnghia](https://github.com/ttnghia) +- Implement groupby `MERGE_LISTS` and `MERGE_SETS` aggregates ([#8436](https://github.com/rapidsai/cudf/pull/8436)) [@ttnghia](https://github.com/ttnghia) +- Add public libcudf match_dictionaries API ([#8429](https://github.com/rapidsai/cudf/pull/8429)) [@davidwendt](https://github.com/davidwendt) +- Add move 
constructors for `string_scalar` and `struct_scalar` ([#8428](https://github.com/rapidsai/cudf/pull/8428)) [@ttnghia](https://github.com/ttnghia) +- Implement `strings::repeat_strings` ([#8423](https://github.com/rapidsai/cudf/pull/8423)) [@ttnghia](https://github.com/ttnghia) +- STRUCT column support for cudf::merge. ([#8422](https://github.com/rapidsai/cudf/pull/8422)) [@nvdbaranec](https://github.com/nvdbaranec) +- Implement reverse in libcudf ([#8410](https://github.com/rapidsai/cudf/pull/8410)) [@shaneding](https://github.com/shaneding) +- Support multiple input files/buffers for read_json ([#8403](https://github.com/rapidsai/cudf/pull/8403)) [@jdye64](https://github.com/jdye64) +- Improve test coverage for struct search ([#8396](https://github.com/rapidsai/cudf/pull/8396)) [@ttnghia](https://github.com/ttnghia) +- Add `groupby.fillna` ([#8362](https://github.com/rapidsai/cudf/pull/8362)) [@isVoid](https://github.com/isVoid) +- Enable AST-based joining ([#8214](https://github.com/rapidsai/cudf/pull/8214)) [@vyasr](https://github.com/vyasr) +- Generalized null support in user defined functions ([#8213](https://github.com/rapidsai/cudf/pull/8213)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add compiled binary operation ([#8192](https://github.com/rapidsai/cudf/pull/8192)) [@karthikeyann](https://github.com/karthikeyann) +- Implement `.describe() ` for `DataFrameGroupBy` ([#8179](https://github.com/rapidsai/cudf/pull/8179)) [@skirui-source](https://github.com/skirui-source) +- ORC - Support reading multiple orc files/buffers in a single operation ([#8142](https://github.com/rapidsai/cudf/pull/8142)) [@jdye64](https://github.com/jdye64) +- Add Python bindings for `lists::concatenate_list_elements` and expose them as `.list.concat()` ([#8006](https://github.com/rapidsai/cudf/pull/8006)) [@shwina](https://github.com/shwina) +- Use Arrow URI FileSystem backed instance to retrieve remote files ([#7709](https://github.com/rapidsai/cudf/pull/7709)) [@jdye64](https://github.com/jdye64) +- Example to build custom application and link to libcudf ([#7671](https://github.com/rapidsai/cudf/pull/7671)) [@isVoid](https://github.com/isVoid) +- Upgrade arrow to 4.0.1 ([#7495](https://github.com/rapidsai/cudf/pull/7495)) [@galipremsagar](https://github.com/galipremsagar) + +## 🛠️ Improvements + +- Provide a better error message when `CUDA::cuda_driver` not found ([#8794](https://github.com/rapidsai/cudf/pull/8794)) [@robertmaynard](https://github.com/robertmaynard) +- Remove anonymous namespace from null_mask.cuh ([#8786](https://github.com/rapidsai/cudf/pull/8786)) [@nvdbaranec](https://github.com/nvdbaranec) +- Allow cudf to be built without libcuda.so existing ([#8751](https://github.com/rapidsai/cudf/pull/8751)) [@robertmaynard](https://github.com/robertmaynard) +- Pin `mimesis` to `<4.1` ([#8745](https://github.com/rapidsai/cudf/pull/8745)) [@galipremsagar](https://github.com/galipremsagar) +- Update `conda` environment name for CI ([#8692](https://github.com/rapidsai/cudf/pull/8692)) [@ajschmidt8](https://github.com/ajschmidt8) +- Remove flatbuffers dependency ([#8671](https://github.com/rapidsai/cudf/pull/8671)) 
[@Ethyling](https://github.com/Ethyling) +- Add options to build Arrow with Python and Parquet support ([#8670](https://github.com/rapidsai/cudf/pull/8670)) [@trxcllnt](https://github.com/trxcllnt) +- Remove unused cudf::strings::create_offsets ([#8663](https://github.com/rapidsai/cudf/pull/8663)) [@davidwendt](https://github.com/davidwendt) +- Update GDS lib version to 1.0.0 ([#8654](https://github.com/rapidsai/cudf/pull/8654)) [@pxLi](https://github.com/pxLi) +- Support for groupby/scan rank and dense_rank aggregations ([#8652](https://github.com/rapidsai/cudf/pull/8652)) [@rwlee](https://github.com/rwlee) +- Fix usage of deprecated arrow ipc API ([#8632](https://github.com/rapidsai/cudf/pull/8632)) [@revans2](https://github.com/revans2) +- Use absolute imports in `cudf` ([#8631](https://github.com/rapidsai/cudf/pull/8631)) [@galipremsagar](https://github.com/galipremsagar) +- ENH Add Java CI build script ([#8627](https://github.com/rapidsai/cudf/pull/8627)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- Add DeprecationWarning to `ser.str.subword_tokenize` ([#8603](https://github.com/rapidsai/cudf/pull/8603)) [@VibhuJawa](https://github.com/VibhuJawa) +- Rewrite binary operations for improved performance and additional type support ([#8598](https://github.com/rapidsai/cudf/pull/8598)) [@vyasr](https://github.com/vyasr) +- Fix `mypy` errors surfacing because of `numpy-1.21.0` ([#8595](https://github.com/rapidsai/cudf/pull/8595)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unneeded includes from cudf::string_view headers ([#8594](https://github.com/rapidsai/cudf/pull/8594)) [@davidwendt](https://github.com/davidwendt) +- Use cmake 3.20.1 as it is now required by rmm ([#8586](https://github.com/rapidsai/cudf/pull/8586)) [@robertmaynard](https://github.com/robertmaynard) +- Remove device debug symbols from cmake CUDF_CUDA_FLAGS ([#8584](https://github.com/rapidsai/cudf/pull/8584)) [@davidwendt](https://github.com/davidwendt) +- Dask-CuDF: use default Dask Dataframe optimizer ([#8581](https://github.com/rapidsai/cudf/pull/8581)) [@madsbk](https://github.com/madsbk) +- Remove checking if an unsigned value is less than zero ([#8579](https://github.com/rapidsai/cudf/pull/8579)) [@robertmaynard](https://github.com/robertmaynard) +- Remove strings_count parameter from cudf::strings::detail::create_chars_child_column ([#8576](https://github.com/rapidsai/cudf/pull/8576)) [@davidwendt](https://github.com/davidwendt) +- Make `cudf.api.types` imports consistent ([#8571](https://github.com/rapidsai/cudf/pull/8571)) [@galipremsagar](https://github.com/galipremsagar) +- Modernize libcudf basic example CMakeFile; updates CI build tests ([#8568](https://github.com/rapidsai/cudf/pull/8568)) [@isVoid](https://github.com/isVoid) +- Rename concatenate_tests.cu to .cpp ([#8555](https://github.com/rapidsai/cudf/pull/8555)) [@davidwendt](https://github.com/davidwendt) +- enable window lead/lag test on struct ([#8548](https://github.com/rapidsai/cudf/pull/8548)) [@wbo4958](https://github.com/wbo4958) +- Add Java methods to split and write column views ([#8546](https://github.com/rapidsai/cudf/pull/8546)) [@razajafri](https://github.com/razajafri) +- 
Small cleanup ([#8534](https://github.com/rapidsai/cudf/pull/8534)) [@codereport](https://github.com/codereport) +- Unpin `dask` version in CI ([#8533](https://github.com/rapidsai/cudf/pull/8533)) [@galipremsagar](https://github.com/galipremsagar) +- Added optional flag for building Arrow with S3 filesystem support ([#8531](https://github.com/rapidsai/cudf/pull/8531)) [@jdye64](https://github.com/jdye64) +- Minor clean up of various internal column and frame utilities ([#8528](https://github.com/rapidsai/cudf/pull/8528)) [@vyasr](https://github.com/vyasr) +- Rename some copying_test source files .cu to .cpp ([#8527](https://github.com/rapidsai/cudf/pull/8527)) [@davidwendt](https://github.com/davidwendt) +- Correct the last warnings and issues when using newer cuda versions ([#8525](https://github.com/rapidsai/cudf/pull/8525)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in transform and unary ops ([#8521](https://github.com/rapidsai/cudf/pull/8521)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in string algorithms ([#8509](https://github.com/rapidsai/cudf/pull/8509)) [@robertmaynard](https://github.com/robertmaynard) +- Add in JNI APIs for scan, replace_nulls, group_by.scan, and group_by.replace_nulls ([#8503](https://github.com/rapidsai/cudf/pull/8503)) [@revans2](https://github.com/revans2) +- Fix `21.08` forward-merge conflicts ([#8502](https://github.com/rapidsai/cudf/pull/8502)) [@ajschmidt8](https://github.com/ajschmidt8) +- Fix Cython formatting command in Contributing.md. ([#8496](https://github.com/rapidsai/cudf/pull/8496)) [@marlenezw](https://github.com/marlenezw) +- Bug/correct unused parameters in reshape and text ([#8495](https://github.com/rapidsai/cudf/pull/8495)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in partitioning and stream compact ([#8494](https://github.com/rapidsai/cudf/pull/8494)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in labelling and list algorithms ([#8493](https://github.com/rapidsai/cudf/pull/8493)) [@robertmaynard](https://github.com/robertmaynard) +- Refactor index construction ([#8485](https://github.com/rapidsai/cudf/pull/8485)) [@vyasr](https://github.com/vyasr) +- Correct unused parameter warnings in replace algorithms ([#8483](https://github.com/rapidsai/cudf/pull/8483)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in reduction algorithms ([#8481](https://github.com/rapidsai/cudf/pull/8481)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in io algorithms ([#8480](https://github.com/rapidsai/cudf/pull/8480)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in interop algorithms ([#8479](https://github.com/rapidsai/cudf/pull/8479)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in filling algorithms ([#8468](https://github.com/rapidsai/cudf/pull/8468)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameter warnings in groupby 
([#8467](https://github.com/rapidsai/cudf/pull/8467)) [@robertmaynard](https://github.com/robertmaynard) +- use libcu++ time_point as timestamp ([#8466](https://github.com/rapidsai/cudf/pull/8466)) [@karthikeyann](https://github.com/karthikeyann) +- Modify reprog_device::extract to return groups in a single pass ([#8460](https://github.com/rapidsai/cudf/pull/8460)) [@davidwendt](https://github.com/davidwendt) +- Update minimum Dask requirement to 2021.6.0 ([#8458](https://github.com/rapidsai/cudf/pull/8458)) [@pentschev](https://github.com/pentschev) +- Fix failures when performing binary operations on DataFrames with empty columns ([#8452](https://github.com/rapidsai/cudf/pull/8452)) [@ChrisJar](https://github.com/ChrisJar) +- Fix conflicts in `8447` ([#8448](https://github.com/rapidsai/cudf/pull/8448)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add serialization methods for `List` and `StructDtype` ([#8441](https://github.com/rapidsai/cudf/pull/8441)) [@charlesbluca](https://github.com/charlesbluca) +- Replace make_empty_strings_column with make_empty_column ([#8435](https://github.com/rapidsai/cudf/pull/8435)) [@davidwendt](https://github.com/davidwendt) +- JNI bindings for get_element ([#8433](https://github.com/rapidsai/cudf/pull/8433)) [@revans2](https://github.com/revans2) +- Update dask make_meta changes to be compatible with dask upstream ([#8426](https://github.com/rapidsai/cudf/pull/8426)) [@galipremsagar](https://github.com/galipremsagar) +- Unpin dask version on CI ([#8425](https://github.com/rapidsai/cudf/pull/8425)) [@galipremsagar](https://github.com/galipremsagar) +- Add benchmark for strings/fixed_point convert APIs ([#8417](https://github.com/rapidsai/cudf/pull/8417)) [@davidwendt](https://github.com/davidwendt) +- Adapt `cudf::scalar` classes to changes in `rmm::device_scalar` ([#8411](https://github.com/rapidsai/cudf/pull/8411)) [@harrism](https://github.com/harrism) +- Add benchmark for strings/integers convert APIs ([#8402](https://github.com/rapidsai/cudf/pull/8402)) [@davidwendt](https://github.com/davidwendt) +- Enable multi-file partitioning in dask_cudf.read_parquet ([#8393](https://github.com/rapidsai/cudf/pull/8393)) [@rjzamora](https://github.com/rjzamora) +- Correct unused parameter warnings in rolling algorithms ([#8390](https://github.com/rapidsai/cudf/pull/8390)) [@robertmaynard](https://github.com/robertmaynard) +- Correct unused parameters in column round and search ([#8389](https://github.com/rapidsai/cudf/pull/8389)) [@robertmaynard](https://github.com/robertmaynard) +- Add functionality to apply `Dtype` metadata to `ColumnBase` ([#8373](https://github.com/rapidsai/cudf/pull/8373)) [@charlesbluca](https://github.com/charlesbluca) +- Refactor setting stack size in regex code ([#8358](https://github.com/rapidsai/cudf/pull/8358)) [@davidwendt](https://github.com/davidwendt) +- Update Java bindings to 21.08-SNAPSHOT ([#8344](https://github.com/rapidsai/cudf/pull/8344)) [@pxLi](https://github.com/pxLi) +- Replace remaining uses of device_vector ([#8343](https://github.com/rapidsai/cudf/pull/8343)) [@harrism](https://github.com/harrism) +- Statically link libnvcomp into libcudfjni 
([#8334](https://github.com/rapidsai/cudf/pull/8334)) [@jlowe](https://github.com/jlowe) +- Resolve auto merge conflicts for Branch 21.08 from branch 21.06 ([#8329](https://github.com/rapidsai/cudf/pull/8329)) [@galipremsagar](https://github.com/galipremsagar) +- Minor code refactor for sorted_order ([#8326](https://github.com/rapidsai/cudf/pull/8326)) [@wbo4958](https://github.com/wbo4958) +- Remove special Index class from the general index class hierarchy ([#8309](https://github.com/rapidsai/cudf/pull/8309)) [@vyasr](https://github.com/vyasr) +- Add first-class dtype utilities ([#8308](https://github.com/rapidsai/cudf/pull/8308)) [@vyasr](https://github.com/vyasr) +- Add option to link Java bindings with Arrow dynamically ([#8307](https://github.com/rapidsai/cudf/pull/8307)) [@jlowe](https://github.com/jlowe) +- Refactor ColumnMethods and its subclasses to remove `column` argument and require `parent` argument ([#8306](https://github.com/rapidsai/cudf/pull/8306)) [@shwina](https://github.com/shwina) +- Refactor `scatter` for list columns ([#8255](https://github.com/rapidsai/cudf/pull/8255)) [@isVoid](https://github.com/isVoid) +- Expose pack/unpack API to Python ([#8153](https://github.com/rapidsai/cudf/pull/8153)) [@charlesbluca](https://github.com/charlesbluca) +- Adding cudf.cut method ([#8002](https://github.com/rapidsai/cudf/pull/8002)) [@marlenezw](https://github.com/marlenezw) +- Optimize string gather performance for large strings ([#7980](https://github.com/rapidsai/cudf/pull/7980)) [@gaohao95](https://github.com/gaohao95) +- Add peak memory usage tracking to cuIO benchmarks ([#7770](https://github.com/rapidsai/cudf/pull/7770)) [@devavret](https://github.com/devavret) +- Updating Clang Version to 11.0.0 ([#6695](https://github.com/rapidsai/cudf/pull/6695)) [@codereport](https://github.com/codereport) # cuDF 21.06.00 (9 Jun 2021) diff --git a/README.md b/README.md index 587f18d2603..525820eee01 100644 --- a/README.md +++ b/README.md @@ -65,15 +65,15 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids cuDF can be installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` channel: -For `cudf version == 21.06` : +For `cudf version == 21.08` : ```bash # for CUDA 11.0 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=11.0 + cudf=21.08 python=3.7 cudatoolkit=11.0 # or, for CUDA 11.2 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=11.2 + cudf=21.08 python=3.7 cudatoolkit=11.2 ``` diff --git a/build.sh b/build.sh index 70b93427d5c..11948c64412 100755 --- a/build.sh +++ b/build.sh @@ -18,26 +18,27 @@ ARGS=$* REPODIR=$(cd $(dirname $0); pwd) VALIDARGS="clean libcudf cudf dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n -l --allgpuarch --disable_nvtx --show_depr_warn --ptds -h" -HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [-l] - clean - remove all existing build artifacts and configuration (start - over) - libcudf - build the cudf C++ code only - cudf - build the cudf Python package - dask_cudf - build the dask_cudf Python package - benchmarks - build 
benchmarks - tests - build tests - libcudf_kafka - build the libcudf_kafka C++ code only - cudf_kafka - build the cudf_kafka Python package - custreamz - build the custreamz Python package - -v - verbose build mode - -g - build for debug - -n - no install step - -l - build legacy tests - --allgpuarch - build for all supported GPU architectures - --disable_nvtx - disable inserting NVTX profiling ranges - --show_depr_warn - show cmake deprecation warnings - --ptds - enable per-thread default stream - -h | --h[elp] - print this text +HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [-l] [--cmake-args=\"\"] + clean - remove all existing build artifacts and configuration (start + over) + libcudf - build the cudf C++ code only + cudf - build the cudf Python package + dask_cudf - build the dask_cudf Python package + benchmarks - build benchmarks + tests - build tests + libcudf_kafka - build the libcudf_kafka C++ code only + cudf_kafka - build the cudf_kafka Python package + custreamz - build the custreamz Python package + -v - verbose build mode + -g - build for debug + -n - no install step + -l - build legacy tests + --allgpuarch - build for all supported GPU architectures + --disable_nvtx - disable inserting NVTX profiling ranges + --show_depr_warn - show cmake deprecation warnings + --ptds - enable per-thread default stream + --cmake-args=\\\"\\\" - pass arbitrary list of CMake configuration options (escape all quotes in argument) + -h | --h[elp] - print this text default action (no args) is to build and install 'libcudf' then 'cudf' then 'dask_cudf' targets @@ -71,6 +72,28 @@ function hasArg { (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") } +function cmakeArgs { + # Check for multiple cmake args options + if [[ $(echo $ARGS | { grep -Eo "\-\-cmake\-args" || true; } | wc -l ) -gt 1 ]]; then + echo "Multiple --cmake-args options were provided, please provide only one: ${ARGS}" + exit 1 + fi + + # Check for cmake args option + if [[ -n $(echo $ARGS | { grep -E "\-\-cmake\-args" || true; } ) ]]; then + # There are possible weird edge cases that may cause this regex filter to output nothing and fail silently + # the true pipe will catch any weird edge cases that may happen and will cause the program to fall back + # on the invalid option error + CMAKE_ARGS=$(echo $ARGS | { grep -Eo "\-\-cmake\-args=\".+\"" || true; }) + if [[ -n ${CMAKE_ARGS} ]]; then + # Remove the full CMAKE_ARGS argument from list of args so that it passes validArgs function + ARGS=${ARGS//$CMAKE_ARGS/} + # Filter the full argument down to just the extra string that will be added to cmake call + CMAKE_ARGS=$(echo $CMAKE_ARGS | grep -Eo "\".+\"" | sed -e 's/^"//' -e 's/"$//') + fi + fi +} + function buildAll { ((${NUMARGS} == 0 )) || !(echo " ${ARGS} " | grep -q " [^-]\+ ") } @@ -82,9 +105,11 @@ fi # Check for valid usage if (( ${NUMARGS} != 0 )); then + # Check for cmake args + cmakeArgs for a in ${ARGS}; do if ! (echo " ${VALIDARGS} " | grep -q " ${a} "); then - echo "Invalid option: ${a}" + echo "Invalid option or formatting, check --help: ${a}" exit 1 fi done @@ -139,7 +164,6 @@ fi # Configure, build, and install libcudf if buildAll || hasArg libcudf; then - if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then CUDF_CMAKE_CUDA_ARCHITECTURES="-DCMAKE_CUDA_ARCHITECTURES=" echo "Building for the architecture of the GPU in the system..." 
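Editor's note on the `--cmake-args` pass-through added above — a minimal illustrative sketch, not part of the diff; the specific `-D` settings are arbitrary examples (any option understood by libcudf's CMakeLists.txt, such as the `CUDF_ENABLE_ARROW_S3` and `CUDF_USE_ARROW_STATIC` options shown later in this diff, can be forwarded the same way):

```bash
# Quotes must be escaped so the literal --cmake-args="..." string survives shell
# expansion and matches the pattern that the new cmakeArgs() function greps for.
./build.sh libcudf --cmake-args=\"-DCUDF_ENABLE_ARROW_S3=OFF -DCUDF_USE_ARROW_STATIC=ON\"
```

The extracted option string is appended to the `cmake` configure invocations for both libcudf and libcudf_kafka, after the flags build.sh already sets.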
@@ -156,7 +180,8 @@ if buildAll || hasArg libcudf; then -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \ -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \ -DPER_THREAD_DEFAULT_STREAM=${BUILD_PER_THREAD_DEFAULT_STREAM} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + ${CMAKE_ARGS} cd ${LIB_BUILD_DIR} @@ -172,8 +197,7 @@ if buildAll || hasArg cudf; then cd ${REPODIR}/python/cudf if [[ ${INSTALL_TARGET} != "" ]]; then - PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} - python setup.py install --single-version-externally-managed --record=record.txt + PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext -j${PARALLEL_LEVEL} install --single-version-externally-managed --record=record.txt else PARALLEL_LEVEL=${PARALLEL_LEVEL} python setup.py build_ext --inplace -j${PARALLEL_LEVEL} --library-dir=${LIBCUDF_BUILD_DIR} fi @@ -196,7 +220,8 @@ if hasArg libcudf_kafka; then cmake -S $REPODIR/cpp/libcudf_kafka -B ${KAFKA_LIB_BUILD_DIR} \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -DBUILD_TESTS=${BUILD_TESTS} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + ${CMAKE_ARGS} cd ${KAFKA_LIB_BUILD_DIR} diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 70bbe88a00c..2c0984569db 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=4.0.1=*cuda + - pyarrow=5.0.0=*cuda - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -26,7 +26,6 @@ dependencies: - pytest-benchmark - pytest-xdist - sphinx - - sphinx_rtd_theme - sphinxcontrib-websupport - nbsphinx - numpydoc @@ -43,7 +42,7 @@ dependencies: - dask>=2021.6.0 - distributed>=2021.6.0 - streamz - - arrow-cpp=4.0.1 + - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 - arrow-cpp-proc * cuda - double-conversion @@ -57,6 +56,7 @@ dependencies: - nvtx>=0.2.1 - cachetools - transformers + - pydata-sphinx-theme - pip: - git+https://github.com/dask/dask.git@main - git+https://github.com/dask/distributed.git@main diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 6d2abdda449..766d85e957b 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -17,7 +17,7 @@ dependencies: - numba>=0.53.1 - numpy - pandas>=1.0,<1.3.0dev0 - - pyarrow=4.0.1=*cuda + - pyarrow=5.0.0=*cuda - fastavro>=0.22.9 - notebook>=0.5.0 - cython>=0.29,<0.30 @@ -26,7 +26,6 @@ dependencies: - pytest-benchmark - pytest-xdist - sphinx - - sphinx_rtd_theme - sphinxcontrib-websupport - nbsphinx - numpydoc @@ -43,7 +42,7 @@ dependencies: - dask>=2021.6.0 - distributed>=2021.6.0 - streamz - - arrow-cpp=4.0.1 + - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 - arrow-cpp-proc * cuda - double-conversion @@ -57,6 +56,7 @@ dependencies: - nvtx>=0.2.1 - cachetools - transformers + - pydata-sphinx-theme - pip: - git+https://github.com/dask/dask.git@main - git+https://github.com/dask/distributed.git@main diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 9023e89c2f5..ca36acccfbb 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -30,7 +30,7 @@ requirements: - setuptools - numba >=0.53.1 - dlpack>=0.5,<0.6.0a0 - - pyarrow 4.0.1 *cuda + - pyarrow 5.0.0 *cuda - libcudf {{ version }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} diff --git 
a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 6c4175a2539..208c21c2dc0 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -37,7 +37,7 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 4.0.1 *cuda + - arrow-cpp 5.0.0 *cuda - arrow-cpp-proc * cuda - dlpack>=0.5,<0.6.0a0 run: @@ -51,11 +51,9 @@ test: - test -f $PREFIX/lib/libcudf.so - test -f $PREFIX/lib/libcudftestutil.a - test -f $PREFIX/include/cudf/aggregation.hpp - - test -f $PREFIX/include/cudf/ast/transform.hpp - - test -f $PREFIX/include/cudf/ast/detail/linearizer.hpp + - test -f $PREFIX/include/cudf/ast/detail/expression_parser.hpp - test -f $PREFIX/include/cudf/ast/detail/operators.hpp - - test -f $PREFIX/include/cudf/ast/nodes.hpp - - test -f $PREFIX/include/cudf/ast/operators.hpp + - test -f $PREFIX/include/cudf/ast/expressions.hpp - test -f $PREFIX/include/cudf/binaryop.hpp - test -f $PREFIX/include/cudf/labeling/label_bins.hpp - test -f $PREFIX/include/cudf/column/column_factories.hpp @@ -102,6 +100,7 @@ test: - test -f $PREFIX/include/cudf/detail/utilities/integer_utils.hpp - test -f $PREFIX/include/cudf/detail/utilities/int_fastdiv.h - test -f $PREFIX/include/cudf/detail/utilities/vector_factories.hpp + - test -f $PREFIX/include/cudf/detail/utilities/visitor_overload.hpp - test -f $PREFIX/include/cudf/dictionary/detail/concatenate.hpp - test -f $PREFIX/include/cudf/dictionary/detail/encode.hpp - test -f $PREFIX/include/cudf/dictionary/detail/merge.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5c05a58b448..3eee1147414 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,6 +28,17 @@ elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "") set(CUDF_BUILD_FOR_DETECTED_ARCHS TRUE) endif() +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.10/RAPIDS.cmake + ${CMAKE_BINARY_DIR}/RAPIDS.cmake) +include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) + +include(rapids-cmake) +include(rapids-cpm) +include(rapids-cuda) +include(rapids-export) +include(rapids-find) + + project(CUDF VERSION 21.10.00 LANGUAGES C CXX) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, @@ -44,6 +55,7 @@ option(BUILD_BENCHMARKS "Configure CMake to build (google & nvbench) benchmarks" option(BUILD_SHARED_LIBS "Build cuDF shared libraries" ON) option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF) +option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF) option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF) option(CUDF_ENABLE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF) option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" ON) @@ -137,6 +149,9 @@ include(cmake/thirdparty/CUDF_GetArrow.cmake) include(cmake/thirdparty/CUDF_GetDLPack.cmake) # find libcu++ include(cmake/thirdparty/CUDF_GetLibcudacxx.cmake) +# find cuCollections +# Should come after including thrust and libcudacxx +include(cmake/thirdparty/CUDF_GetcuCollections.cmake) # find or install GoogleTest include(cmake/thirdparty/CUDF_GetGTest.cmake) # preprocess jitify-able kernels @@ -151,8 +166,8 @@ add_library(cudf src/aggregation/aggregation.cpp src/aggregation/aggregation.cu src/aggregation/result_cache.cpp - src/ast/linearizer.cpp - src/ast/transform.cu + src/ast/expression_parser.cpp + src/ast/expressions.cpp src/binaryop/binaryop.cpp src/binaryop/compiled/binary_ops.cu 
src/binaryop/compiled/Add.cu @@ -255,6 +270,7 @@ add_library(cudf src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/to_arrow.cu + src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp src/io/avro/avro_gpu.cu src/io/avro/reader_impl.cu @@ -283,7 +299,7 @@ add_library(cudf src/io/orc/writer_impl.cu src/io/parquet/compact_protocol_writer.cpp src/io/parquet/page_data.cu - src/io/parquet/page_dict.cu + src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu src/io/parquet/page_hdr.cu src/io/parquet/parquet.cpp @@ -305,6 +321,7 @@ add_library(cudf src/join/cross_join.cu src/join/hash_join.cu src/join/join.cu + src/join/join_utils.cu src/join/semi_join.cu src/lists/contains.cu src/lists/combine/concatenate_list_elements.cu @@ -436,6 +453,7 @@ add_library(cudf src/text/subword/wordpiece_tokenizer.cu src/text/tokenize.cu src/transform/bools_to_mask.cu + src/transform/compute_column.cu src/transform/encode.cu src/transform/mask_to_bools.cu src/transform/nans_to_nulls.cu @@ -523,7 +541,8 @@ target_link_libraries(cudf PUBLIC ZLIB::ZLIB ${ARROW_LIBRARIES} cudf::Thrust - rmm::rmm) + rmm::rmm + PRIVATE cuco::cuco) if(CUDA_STATIC_RUNTIME) # Tell CMake what CUDA language runtime to use @@ -628,9 +647,11 @@ endif() ################################################################################################### # - install targets ------------------------------------------------------------------------------- +include(CPack) + include(GNUInstallDirs) -set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/cudf) +set(INSTALL_CONFIGDIR lib/cmake/cudf) set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME cudf) # install target for cudf_base and the proxy libcudf.so @@ -679,22 +700,6 @@ configure_package_config_file(cmake/cudf-build-config.cmake.in ${CUDF_BINARY_DIR write_basic_package_version_file(${CUDF_BINARY_DIR}/cudf-config-version.cmake COMPATIBILITY SameMinorVersion) -if(TARGET arrow_shared) - get_target_property(arrow_is_imported arrow_shared IMPORTED) - if(NOT arrow_is_imported) - export(TARGETS arrow_shared arrow_cuda_shared - FILE ${CUDF_BINARY_DIR}/cudf-arrow-targets.cmake - NAMESPACE cudf::) - endif() -elseif(TARGET arrow_static) - get_target_property(arrow_is_imported arrow_static IMPORTED) - if(NOT arrow_is_imported) - export(TARGETS arrow_static arrow_cuda_static - FILE ${CUDF_BINARY_DIR}/cudf-arrow-targets.cmake - NAMESPACE cudf::) - endif() -endif() - if(TARGET gtest) get_target_property(gtest_is_imported gtest IMPORTED) if(NOT gtest_is_imported) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index d0a47984053..56f17dc7090 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -29,6 +29,7 @@ target_link_libraries(cudf_datagen GTest::gmock_main GTest::gtest_main benchmark::benchmark + nvbench::nvbench Threads::Threads cudf) @@ -102,6 +103,7 @@ ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchma ################################################################################################### # - join benchmark -------------------------------------------------------------------------------- ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) +ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) ################################################################################################### # - iterator benchmark ---------------------------------------------------------------------------- diff --git a/cpp/benchmarks/ast/transform_benchmark.cpp 
b/cpp/benchmarks/ast/transform_benchmark.cpp index 6f131cf0d6a..fd0a0f7d2c8 100644 --- a/cpp/benchmarks/ast/transform_benchmark.cpp +++ b/cpp/benchmarks/ast/transform_benchmark.cpp @@ -14,10 +14,10 @@ * limitations under the License. */ -#include <cudf/ast/transform.hpp> #include #include #include +#include <cudf/transform.hpp> #include #include @@ -95,22 +95,22 @@ static void BM_ast_transform(benchmark::State& state) // Note that a std::list is required here because of its guarantees against reference invalidation // when items are added or removed. References to items in a std::vector are not safe if the // vector must re-allocate. - auto expressions = std::list<cudf::ast::expression>(); + auto expressions = std::list<cudf::ast::operation>(); // Construct tree that chains additions like (((a + b) + c) + d) auto const op = cudf::ast::ast_operator::ADD; if (reuse_columns) { - expressions.push_back(cudf::ast::expression(op, column_refs.at(0), column_refs.at(0))); + expressions.push_back(cudf::ast::operation(op, column_refs.at(0), column_refs.at(0))); for (cudf::size_type i = 0; i < tree_levels - 1; i++) { - expressions.push_back(cudf::ast::expression(op, expressions.back(), column_refs.at(0))); + expressions.push_back(cudf::ast::operation(op, expressions.back(), column_refs.at(0))); } } else { - expressions.push_back(cudf::ast::expression(op, column_refs.at(0), column_refs.at(1))); + expressions.push_back(cudf::ast::operation(op, column_refs.at(0), column_refs.at(1))); std::transform(std::next(column_refs.cbegin(), 2), column_refs.cend(), std::back_inserter(expressions), [&](auto const& column_ref) { - return cudf::ast::expression(op, expressions.back(), column_ref); + return cudf::ast::operation(op, expressions.back(), column_ref); }); } @@ -119,7 +119,7 @@ static void BM_ast_transform(benchmark::State& state) // Execute benchmark for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::ast::compute_column(table, expression_tree_root); + cudf::compute_column(table, expression_tree_root); } // Use the number of bytes read from global memory diff --git a/cpp/benchmarks/fixture/rmm_pool_raii.hpp b/cpp/benchmarks/fixture/rmm_pool_raii.hpp new file mode 100644 index 00000000000..9038f523b29 --- /dev/null +++ b/cpp/benchmarks/fixture/rmm_pool_raii.hpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <rmm/mr/device/cuda_memory_resource.hpp> +#include <rmm/mr/device/owning_wrapper.hpp> +#include <rmm/mr/device/per_device_resource.hpp> +#include <rmm/mr/device/pool_memory_resource.hpp> + +namespace cudf { + +/** + * @brief An RAII class setting up RMM memory pool for `nvbench` benchmarks + * + * This is a temporary solution before templated fixtures tests are supported + * in `nvbench`. Similarly to `cudf::benchmark`, creating this RAII object in + * each benchmark will ensure that the RAPIDS Memory Manager pool mode is used + * in benchmarks, which eliminates memory allocation / deallocation performance + * overhead from the benchmark.
+ * + * Example: + * + * void my_benchmark(nvbench::state& state) { + * cudf::rmm_pool_raii pool_raii; + * state.exec([](nvbench::launch& launch) { + * // benchmark stuff + * }); + * } + * + * NVBENCH_BENCH(my_benchmark); + */ +class rmm_pool_raii { + private: + // memory resource factory helpers + inline auto make_cuda() { return std::make_shared(); } + + inline auto make_pool() + { + return rmm::mr::make_owning_wrapper(make_cuda()); + } + + public: + rmm_pool_raii() + { + mr = make_pool(); + rmm::mr::set_current_device_resource(mr.get()); // set default resource to pool + } + + ~rmm_pool_raii() + { + rmm::mr::set_current_device_resource(nullptr); + mr.reset(); + } + + private: + std::shared_ptr mr; +}; + +} // namespace cudf diff --git a/cpp/benchmarks/groupby/group_nth_benchmark.cu b/cpp/benchmarks/groupby/group_nth_benchmark.cu index 9765a4a265c..8d1de36db95 100644 --- a/cpp/benchmarks/groupby/group_nth_benchmark.cu +++ b/cpp/benchmarks/groupby/group_nth_benchmark.cu @@ -63,7 +63,8 @@ void BM_pre_sorted_nth(benchmark::State& state) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = vals; - requests[0].aggregations.push_back(cudf::make_nth_element_aggregation(-1)); + requests[0].aggregations.push_back( + cudf::make_nth_element_aggregation(-1)); for (auto _ : state) { cuda_event_timer timer(state, true); diff --git a/cpp/benchmarks/groupby/group_sum_benchmark.cu b/cpp/benchmarks/groupby/group_sum_benchmark.cu index 1455f1cecdc..6351da66fdd 100644 --- a/cpp/benchmarks/groupby/group_sum_benchmark.cu +++ b/cpp/benchmarks/groupby/group_sum_benchmark.cu @@ -58,7 +58,7 @@ void BM_basic_sum(benchmark::State& state) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = vals; - requests[0].aggregations.push_back(cudf::make_sum_aggregation()); + requests[0].aggregations.push_back(cudf::make_sum_aggregation()); for (auto _ : state) { cuda_event_timer timer(state, true); @@ -97,7 +97,7 @@ void BM_pre_sorted_sum(benchmark::State& state) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = vals; - requests[0].aggregations.push_back(cudf::make_sum_aggregation()); + requests[0].aggregations.push_back(cudf::make_sum_aggregation()); for (auto _ : state) { cuda_event_timer timer(state, true); diff --git a/cpp/benchmarks/join/conditional_join_benchmark.cu b/cpp/benchmarks/join/conditional_join_benchmark.cu index 4a655e29f74..71b90685fb9 100644 --- a/cpp/benchmarks/join/conditional_join_benchmark.cu +++ b/cpp/benchmarks/join/conditional_join_benchmark.cu @@ -14,117 +14,24 @@ * limitations under the License. 
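In the groupby benchmark hunks above, the removed and added `push_back` lines read identically because their template arguments are lost in this rendering; the change appears to instantiate the aggregation factories for the groupby-specific interface introduced later in this diff. A hedged sketch of the resulting request construction (template argument assumed):

```c++
#include <cudf/aggregation.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>

#include <vector>

// Sketch only: aggregations pushed into a groupby request are created through the
// groupby_aggregation interface (the template argument is assumed from this release's API).
std::vector<cudf::groupby::aggregation_request> make_sum_request(cudf::column_view values)
{
  std::vector<cudf::groupby::aggregation_request> requests;
  requests.emplace_back(cudf::groupby::aggregation_request());
  requests[0].values = values;
  requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
  return requests;
}
```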
*/ -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include "generate_input_tables.cuh" +#include template class ConditionalJoin : public cudf::benchmark { }; -template -static void BM_join(benchmark::State& state, Join JoinFunc) -{ - const cudf::size_type build_table_size{(cudf::size_type)state.range(0)}; - const cudf::size_type probe_table_size{(cudf::size_type)state.range(1)}; - const cudf::size_type rand_max_val{build_table_size * 2}; - const double selectivity = 0.3; - const bool is_build_table_key_unique = true; - - // Generate build and probe tables - cudf::test::UniformRandomGenerator rand_gen(0, build_table_size); - auto build_random_null_mask = [&rand_gen](int size) { - if (Nullable) { - // roughly 25% nulls - auto validity = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); - return cudf::test::detail::make_null_mask(validity, validity + size); - } else { - return cudf::create_null_mask(size, cudf::mask_state::UNINITIALIZED); - } - }; - - std::unique_ptr build_key_column = [&]() { - return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - build_table_size, - build_random_null_mask(build_table_size)) - : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - build_table_size); - }(); - std::unique_ptr probe_key_column = [&]() { - return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - probe_table_size, - build_random_null_mask(probe_table_size)) - : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - probe_table_size); - }(); - - generate_input_tables( - build_key_column->mutable_view().data(), - build_table_size, - probe_key_column->mutable_view().data(), - probe_table_size, - selectivity, - rand_max_val, - is_build_table_key_unique); - - auto payload_data_it = thrust::make_counting_iterator(0); - cudf::test::fixed_width_column_wrapper build_payload_column( - payload_data_it, payload_data_it + build_table_size); - - cudf::test::fixed_width_column_wrapper probe_payload_column( - payload_data_it, payload_data_it + probe_table_size); - - CHECK_CUDA(0); - - cudf::table_view build_table({build_key_column->view(), build_payload_column}); - cudf::table_view probe_table({probe_key_column->view(), probe_payload_column}); - - // Benchmark the inner join operation - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - // Common column references. 
- const auto col_ref_left_0 = cudf::ast::column_reference(0); - const auto col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); - auto left_zero_eq_right_zero = - cudf::ast::expression(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); - - auto result = - JoinFunc(probe_table, build_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL); - } -} - #define CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type) \ (::benchmark::State & st) \ { \ auto join = [](cudf::table_view const& left, \ cudf::table_view const& right, \ - cudf::ast::expression binary_pred, \ + cudf::ast::operation binary_pred, \ cudf::null_equality compare_nulls) { \ return cudf::conditional_inner_join(left, right, binary_pred, compare_nulls); \ }; \ - BM_join(st, join); \ + constexpr bool is_conditional = true; \ + BM_join(st, join); \ } CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit, int32_t, int32_t, false); @@ -138,11 +45,12 @@ CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit_nulls, int6 { \ auto join = [](cudf::table_view const& left, \ cudf::table_view const& right, \ - cudf::ast::expression binary_pred, \ + cudf::ast::operation binary_pred, \ cudf::null_equality compare_nulls) { \ return cudf::conditional_left_join(left, right, binary_pred, compare_nulls); \ }; \ - BM_join(st, join); \ + constexpr bool is_conditional = true; \ + BM_join(st, join); \ } CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit, int32_t, int32_t, false); @@ -156,11 +64,12 @@ CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_ { \ auto join = [](cudf::table_view const& left, \ cudf::table_view const& right, \ - cudf::ast::expression binary_pred, \ + cudf::ast::operation binary_pred, \ cudf::null_equality compare_nulls) { \ return cudf::conditional_inner_join(left, right, binary_pred, compare_nulls); \ }; \ - BM_join(st, join); \ + constexpr bool is_conditional = true; \ + BM_join(st, join); \ } CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit, int32_t, int32_t, false); @@ -174,11 +83,12 @@ CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit_nulls, int64_ { \ auto join = [](cudf::table_view const& left, \ cudf::table_view const& right, \ - cudf::ast::expression binary_pred, \ + cudf::ast::operation binary_pred, \ cudf::null_equality compare_nulls) { \ return cudf::conditional_left_anti_join(left, right, binary_pred, compare_nulls); \ }; \ - BM_join(st, join); \ + constexpr bool is_conditional = true; \ + BM_join(st, join); \ } CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit, @@ -204,11 +114,12 @@ CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit_nul { \ auto join = [](cudf::table_view const& left, \ cudf::table_view const& right, \ - cudf::ast::expression binary_pred, \ + cudf::ast::operation binary_pred, \ cudf::null_equality compare_nulls) { \ return cudf::conditional_left_semi_join(left, right, binary_pred, compare_nulls); \ }; \ - BM_join(st, join); \ + constexpr bool is_conditional = true; \ + BM_join(st, join); \ } CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit, @@ -234,11 +145,6 @@ BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit) ->Args({100'000, 100'000}) ->Args({100'000, 400'000}) ->Args({100'000, 1'000'000}) - // TODO: The below benchmark is slow, but can be 
useful to validate that the - // code works for large data sets. This benchmark was used to compare to the - // otherwise equivalent nullable benchmark below, which has memory errors for - // sufficiently large data sets. - //->Args({1'000'000, 1'000'000}) ->UseManualTime(); BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit) diff --git a/cpp/benchmarks/join/join_benchmark.cu b/cpp/benchmarks/join/join_benchmark.cu index a7c109db9b4..72d9b541232 100644 --- a/cpp/benchmarks/join/join_benchmark.cu +++ b/cpp/benchmarks/join/join_benchmark.cu @@ -14,121 +14,12 @@ * limitations under the License. */ -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include "generate_input_tables.cuh" +#include template class Join : public cudf::benchmark { }; -template -static void BM_join(benchmark::State& state, Join JoinFunc) -{ - const cudf::size_type build_table_size{(cudf::size_type)state.range(0)}; - const cudf::size_type probe_table_size{(cudf::size_type)state.range(1)}; - const cudf::size_type rand_max_val{build_table_size * 2}; - const double selectivity = 0.3; - const bool is_build_table_key_unique = true; - - // Generate build and probe tables - cudf::test::UniformRandomGenerator rand_gen(0, build_table_size); - auto build_random_null_mask = [&rand_gen](int size) { - if (Nullable) { - // roughly 25% nulls - auto validity = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); - return cudf::test::detail::make_null_mask(validity, validity + size); - } else { - return cudf::create_null_mask(size, cudf::mask_state::UNINITIALIZED); - } - }; - - std::unique_ptr build_key_column = [&]() { - return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - build_table_size, - build_random_null_mask(build_table_size)) - : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - build_table_size); - }(); - std::unique_ptr probe_key_column = [&]() { - return Nullable ? 
cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - probe_table_size, - build_random_null_mask(probe_table_size)) - : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), - probe_table_size); - }(); - - generate_input_tables( - build_key_column->mutable_view().data(), - build_table_size, - probe_key_column->mutable_view().data(), - probe_table_size, - selectivity, - rand_max_val, - is_build_table_key_unique); - - auto payload_data_it = thrust::make_counting_iterator(0); - cudf::test::fixed_width_column_wrapper build_payload_column( - payload_data_it, payload_data_it + build_table_size); - - cudf::test::fixed_width_column_wrapper probe_payload_column( - payload_data_it, payload_data_it + probe_table_size); - - CHECK_CUDA(0); - - cudf::table_view build_table({build_key_column->view(), build_payload_column}); - cudf::table_view probe_table({probe_key_column->view(), probe_payload_column}); - - // Setup join parameters and result table - - std::vector columns_to_join = {0}; - - // Benchmark the inner join operation - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - - auto result = JoinFunc( - probe_table, build_table, columns_to_join, columns_to_join, cudf::null_equality::UNEQUAL); - } -} - -#define JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - std::vector const& left_on, \ - std::vector const& right_on, \ - cudf::null_equality compare_nulls) { \ - return cudf::inner_join(left, right, left_on, right_on, compare_nulls); \ - }; \ - BM_join(st, join); \ - } - -JOIN_BENCHMARK_DEFINE(join_32bit, int32_t, int32_t, false); -JOIN_BENCHMARK_DEFINE(join_64bit, int64_t, int64_t, false); -JOIN_BENCHMARK_DEFINE(join_32bit_nulls, int32_t, int32_t, true); -JOIN_BENCHMARK_DEFINE(join_64bit_nulls, int64_t, int64_t, true); - #define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ (::benchmark::State & st) \ @@ -167,43 +58,6 @@ LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit, int64_t, int64_t, false); LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit_nulls, int32_t, int32_t, true); LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit_nulls, int64_t, int64_t, true); -// join ----------------------------------------------------------------------- -BENCHMARK_REGISTER_F(Join, join_32bit) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->Args({10'000'000, 10'000'000}) - ->Args({10'000'000, 40'000'000}) - ->Args({10'000'000, 100'000'000}) - ->Args({100'000'000, 100'000'000}) - ->Args({80'000'000, 240'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, join_64bit) - ->Unit(benchmark::kMillisecond) - ->Args({50'000'000, 50'000'000}) - ->Args({40'000'000, 120'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, join_32bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({100'000, 100'000}) - ->Args({100'000, 400'000}) - ->Args({100'000, 1'000'000}) - ->Args({10'000'000, 10'000'000}) - ->Args({10'000'000, 40'000'000}) - ->Args({10'000'000, 100'000'000}) - ->Args({100'000'000, 100'000'000}) - ->Args({80'000'000, 240'000'000}) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(Join, join_64bit_nulls) - ->Unit(benchmark::kMillisecond) - ->Args({50'000'000, 50'000'000}) - 
->Args({40'000'000, 120'000'000}) - ->UseManualTime(); - // left anti-join ------------------------------------------------------------- BENCHMARK_REGISTER_F(Join, left_anti_join_32bit) ->Unit(benchmark::kMillisecond) diff --git a/cpp/benchmarks/join/join_benchmark_common.hpp b/cpp/benchmarks/join/join_benchmark_common.hpp new file mode 100644 index 00000000000..add87bf7dfb --- /dev/null +++ b/cpp/benchmarks/join/join_benchmark_common.hpp @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "generate_input_tables.cuh" + +template +static void BM_join(state_type& state, Join JoinFunc) +{ + auto const build_table_size = [&]() { + if constexpr (std::is_same_v) { + return static_cast(state.range(0)); + } + if constexpr (std::is_same_v) { + return static_cast(state.get_int64("Build Table Size")); + } + }(); + auto const probe_table_size = [&]() { + if constexpr (std::is_same_v) { + return static_cast(state.range(1)); + } + if constexpr (std::is_same_v) { + return static_cast(state.get_int64("Probe Table Size")); + } + }(); + + const cudf::size_type rand_max_val{build_table_size * 2}; + const double selectivity = 0.3; + const bool is_build_table_key_unique = true; + + // Generate build and probe tables + cudf::test::UniformRandomGenerator rand_gen(0, build_table_size); + auto build_random_null_mask = [&rand_gen](int size) { + // roughly 25% nulls + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); + return cudf::test::detail::make_null_mask(validity, validity + size); + }; + + std::unique_ptr build_key_column = [&]() { + return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + build_table_size, + build_random_null_mask(build_table_size)) + : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + build_table_size); + }(); + std::unique_ptr probe_key_column = [&]() { + return Nullable ? 
cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + probe_table_size, + build_random_null_mask(probe_table_size)) + : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id()), + probe_table_size); + }(); + + generate_input_tables( + build_key_column->mutable_view().data(), + build_table_size, + probe_key_column->mutable_view().data(), + probe_table_size, + selectivity, + rand_max_val, + is_build_table_key_unique); + + auto payload_data_it = thrust::make_counting_iterator(0); + cudf::test::fixed_width_column_wrapper build_payload_column( + payload_data_it, payload_data_it + build_table_size); + + cudf::test::fixed_width_column_wrapper probe_payload_column( + payload_data_it, payload_data_it + probe_table_size); + + CHECK_CUDA(0); + + cudf::table_view build_table({build_key_column->view(), build_payload_column}); + cudf::table_view probe_table({probe_key_column->view(), probe_payload_column}); + + // Setup join parameters and result table + [[maybe_unused]] std::vector columns_to_join = {0}; + + // Benchmark the inner join operation + if constexpr (std::is_same_v and (not is_conditional)) { + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto result = JoinFunc( + probe_table, build_table, columns_to_join, columns_to_join, cudf::null_equality::UNEQUAL); + } + } + if constexpr (std::is_same_v and (not is_conditional)) { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + JoinFunc(probe_table, + build_table, + columns_to_join, + columns_to_join, + cudf::null_equality::UNEQUAL, + stream_view); + }); + } + + // Benchmark conditional join + if constexpr (std::is_same_v and is_conditional) { + // Common column references. + const auto col_ref_left_0 = cudf::ast::column_reference(0); + const auto col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto left_zero_eq_right_zero = + cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + + auto result = + JoinFunc(probe_table, build_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL); + } + } +} diff --git a/cpp/benchmarks/join/join_nvbench.cu b/cpp/benchmarks/join/join_nvbench.cu new file mode 100644 index 00000000000..ffb21d8594d --- /dev/null +++ b/cpp/benchmarks/join/join_nvbench.cu @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
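The compile-time dispatch in `BM_join` above is the key mechanism: a single benchmark body serves both Google Benchmark and nvbench by branching on the state type with `if constexpr`. A stripped-down sketch of the pattern (function and body names are hypothetical):

```c++
#include <benchmark/benchmark.h>
#include <nvbench/nvbench.cuh>

#include <type_traits>

// Sketch only: one benchmark body, two harnesses, selected at compile time.
template <typename state_type>
void run_benchmark(state_type& state)
{
  if constexpr (std::is_same_v<state_type, benchmark::State>) {
    for (auto _ : state) {
      // timed body, Google Benchmark style
    }
  }
  if constexpr (std::is_same_v<state_type, nvbench::state>) {
    state.exec([](nvbench::launch& launch) {
      // timed body, nvbench style; launch.get_stream() supplies the stream to use
    });
  }
}
```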
+ */ + +#include +#include + +void skip_helper(nvbench::state& state) +{ + auto const build_table_size = state.get_int64("Build Table Size"); + auto const probe_table_size = state.get_int64("Probe Table Size"); + + if (build_table_size > probe_table_size) { + state.skip("Large build tables are skipped."); + return; + } + + if (build_table_size * 100 <= probe_table_size) { + state.skip("Large probe tables are skipped."); + return; + } +} + +template +void nvbench_inner_join(nvbench::state& state, + nvbench::type_list>) +{ + skip_helper(state); + + // TODO: to be replaced by nvbench fixture once it's ready + cudf::rmm_pool_raii pool_raii; + + auto join = [](cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream) { + cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); + return hj_obj.inner_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + }; + + BM_join(state, join); +} + +template +void nvbench_left_join(nvbench::state& state, + nvbench::type_list>) +{ + skip_helper(state); + + // TODO: to be replaced by nvbench fixture once it's ready + cudf::rmm_pool_raii pool_raii; + + auto join = [](cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream) { + cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); + return hj_obj.left_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + }; + + BM_join(state, join); +} + +template +void nvbench_full_join(nvbench::state& state, + nvbench::type_list>) +{ + skip_helper(state); + + // TODO: to be replaced by nvbench fixture once it's ready + cudf::rmm_pool_raii pool_raii; + + auto join = [](cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream) { + cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); + return hj_obj.full_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + }; + + BM_join(state, join); +} + +// inner join ----------------------------------------------------------------------- +NVBENCH_BENCH_TYPES(nvbench_inner_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("inner_join_32bit") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) + .add_int64_axis("Probe Table Size", + {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_inner_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("inner_join_64bit") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) + .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_inner_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("inner_join_32bit_nulls") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) + 
.add_int64_axis("Probe Table Size", + {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_inner_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("inner_join_64bit_nulls") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) + .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); + +// left join ------------------------------------------------------------------------ +NVBENCH_BENCH_TYPES(nvbench_left_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("left_join_32bit") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) + .add_int64_axis("Probe Table Size", + {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_left_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("left_join_64bit") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) + .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_left_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("left_join_32bit_nulls") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) + .add_int64_axis("Probe Table Size", + {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_left_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("left_join_64bit_nulls") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) + .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); + +// full join ------------------------------------------------------------------------ +NVBENCH_BENCH_TYPES(nvbench_full_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("full_join_32bit") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) + .add_int64_axis("Probe Table Size", + {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_full_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("full_join_64bit") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) + .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_full_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + nvbench::enum_type_list)) + .set_name("full_join_32bit_nulls") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000}) + .add_int64_axis("Probe Table Size", + {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000}); + +NVBENCH_BENCH_TYPES(nvbench_full_join, + NVBENCH_TYPE_AXES(nvbench::type_list, + nvbench::type_list, + 
nvbench::enum_type_list)) + .set_name("full_join_64bit_nulls") + .set_type_axes_names({"Key Type", "Payload Type", "Nullable"}) + .add_int64_axis("Build Table Size", {40'000'000, 50'000'000}) + .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000}); diff --git a/cpp/cmake/cudf-build-config.cmake.in b/cpp/cmake/cudf-build-config.cmake.in index 9b6dd24069a..4b5ad8ebb8d 100644 --- a/cpp/cmake/cudf-build-config.cmake.in +++ b/cpp/cmake/cudf-build-config.cmake.in @@ -61,6 +61,9 @@ else() if (NOT DEFINED CUDF_ENABLE_ARROW_S3) set(CUDF_ENABLE_ARROW_S3 OFF) endif() + if (NOT DEFINED CUDF_ENABLE_ARROW_ORC) + set(CUDF_ENABLE_ARROW_ORC OFF) + endif() if (NOT DEFINED CUDF_ENABLE_ARROW_PYTHON) set(CUDF_ENABLE_ARROW_PYTHON OFF) endif() diff --git a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake index 8cef3e8b9d0..38a5d8da44a 100644 --- a/cpp/cmake/thirdparty/CUDF_GetArrow.cmake +++ b/cpp/cmake/thirdparty/CUDF_GetArrow.cmake @@ -14,7 +14,25 @@ # limitations under the License. #============================================================================= -function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON ENABLE_PARQUET) +function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON ENABLE_PARQUET) + + if(BUILD_STATIC) + if(TARGET arrow_static AND TARGET arrow_cuda_static) + list(APPEND ARROW_LIBRARIES arrow_static) + list(APPEND ARROW_LIBRARIES arrow_cuda_static) + set(ARROW_FOUND TRUE PARENT_SCOPE) + set(ARROW_LIBRARIES ${ARROW_LIBRARIES} PARENT_SCOPE) + return() + endif() + else() + if(TARGET arrow_shared AND TARGET arrow_cuda_shared) + list(APPEND ARROW_LIBRARIES arrow_shared) + list(APPEND ARROW_LIBRARIES arrow_cuda_shared) + set(ARROW_FOUND TRUE PARENT_SCOPE) + set(ARROW_LIBRARIES ${ARROW_LIBRARIES} PARENT_SCOPE) + return() + endif() + endif() set(ARROW_BUILD_SHARED ON) set(ARROW_BUILD_STATIC OFF) @@ -40,12 +58,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON E list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") # Arrow's logic to build Boost from source is busted, so we have to get it from the system. list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") - # Arrow's logic to find Thrift is busted, so we have to build it from - # source. Why can't we use `THRIFT_SOURCE BUNDLED` you might ask? - # Because that's _also_ busted. The only thing that seems to is to set - # _all_ dependencies to bundled, then optionall un-set BOOST_SOURCE to - # SYSTEM. - list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE BUNDLED") + list(APPEND ARROW_PYTHON_OPTIONS "Thrift_SOURCE BUNDLED") + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") endif() # Set this so Arrow correctly finds the CUDA toolkit when the build machine @@ -68,6 +82,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON E "ARROW_CXXFLAGS -w" "ARROW_JEMALLOC OFF" "ARROW_S3 ${ENABLE_S3}" + "ARROW_ORC ${ENABLE_ORC}" # e.g. 
needed by blazingsql-io "ARROW_PARQUET ${ENABLE_PARQUET}" ${ARROW_PYTHON_OPTIONS} @@ -144,14 +159,31 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_PYTHON E set(ARROW_FOUND "${ARROW_FOUND}" PARENT_SCOPE) set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE) + if(TARGET arrow_shared) + get_target_property(arrow_is_imported arrow_shared IMPORTED) + if(NOT arrow_is_imported) + export(TARGETS arrow_shared arrow_cuda_shared + FILE ${CUDF_BINARY_DIR}/cudf-arrow-targets.cmake + NAMESPACE cudf::) + endif() + elseif(TARGET arrow_static) + get_target_property(arrow_is_imported arrow_static IMPORTED) + if(NOT arrow_is_imported) + export(TARGETS arrow_static arrow_cuda_static + FILE ${CUDF_BINARY_DIR}/cudf-arrow-targets.cmake + NAMESPACE cudf::) + endif() + endif() + endfunction() -set(CUDF_VERSION_Arrow 4.0.1) +set(CUDF_VERSION_Arrow 5.0.0) find_and_configure_arrow( ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} + ${CUDF_ENABLE_ARROW_ORC} ${CUDF_ENABLE_ARROW_PYTHON} ${CUDF_ENABLE_ARROW_PARQUET} ) diff --git a/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake b/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake new file mode 100644 index 00000000000..73717249585 --- /dev/null +++ b/cpp/cmake/thirdparty/CUDF_GetcuCollections.cmake @@ -0,0 +1,38 @@ +#============================================================================= +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +function(find_and_configure_cucollections) + + if(TARGET cuco::cuco) + return() + endif() + + # Find or install cuCollections + CPMFindPackage(NAME cuco + GITHUB_REPOSITORY NVIDIA/cuCollections + GIT_TAG 0d602ae21ea4f38d23ed816aa948453d97b2ee4e + OPTIONS "BUILD_TESTS OFF" + "BUILD_BENCHMARKS OFF" + "BUILD_EXAMPLES OFF" + ) + + set(CUCO_INCLUDE_DIR "${cuco_SOURCE_DIR}/include" PARENT_SCOPE) + + # Make sure consumers of cudf can also see cuco::cuco target + fix_cmake_global_defaults(cuco::cuco) +endfunction() + +find_and_configure_cucollections() diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index 9ec64060847..1da2d43cf6c 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -144,6 +144,16 @@ The following guidelines apply to organizing `#include` lines. * Always check that includes are only necessary for the file in which they are included. Try to avoid excessive including especially in header files. Double check this when you remove code. + * Use quotes `"` to include local headers from the same relative source directory. This should only + occur in source files and non-public header files. Otherwise use angle brackets `<>` around + included header filenames. + * Avoid relative paths with `..` when possible. Paths with `..` are necessary when including + (internal) headers from source paths not in the same directory as the including file, + because source paths are not passed with `-I`. 
+ * Avoid including library internal headers from non-internal files. For example, try not to include + headers from libcudf `src` directories in tests or in libcudf public headers. If you find + yourself doing this, start a discussion about moving (parts of) the included internal header + to a public header. # libcudf Data Structures @@ -246,7 +256,31 @@ An *immutable*, non-owning view of a table. ### `cudf::mutable_table_view` -A *mutable*, non-owning view of a table. +A *mutable*, non-owning view of a table. + +## Spans + +libcudf provides `span` classes that mimic C++20 `std::span`, which is a lightweight +view of a contiguous sequence of objects. libcudf provides two classes, `host_span` and +`device_span`, which can be constructed from multiple container types, or from a pointer +(host or device, respectively) and size, or from iterators. `span` types are useful for defining +generic (internal) interfaces which work with multiple input container types. `device_span` can be +constructed from `thrust::device_vector`, `rmm::device_vector`, or `rmm::device_uvector`. +`host_span` can be constructed from `thrust::host_vector`, `std::vector`, or `std::basic_string`. + +If you are defining internal (detail) functions that operate on vectors, use spans for the input +vector parameters rather than a specific vector type, to make your functions more widely applicable. + +When a `span` refers to immutable elements, use `span`, not `span const`. Since a span +is a lightweight view, it does not propagate `const`-ness. Therefore, `const` should be applied to +the template type parameter, not to the `span` itself. Also, `span` should be passed by value +because it is a lightweight view. APIs in libcudf that take spans as input will look like the +following function that copies device data to a host `std::vector`. + +```c++ +template +std::vector make_std_vector_async(device_span v, rmm::cuda_stream_view stream) +``` ## `cudf::scalar` diff --git a/cpp/examples/basic/src/process_csv.cpp b/cpp/examples/basic/src/process_csv.cpp index 2467c97393b..cd469af0036 100644 --- a/cpp/examples/basic/src/process_csv.cpp +++ b/cpp/examples/basic/src/process_csv.cpp @@ -25,7 +25,7 @@ void write_csv(cudf::table_view const& tbl_view, std::string const& file_path) } std::vector make_single_aggregation_request( - std::unique_ptr&& agg, cudf::column_view value) + std::unique_ptr&& agg, cudf::column_view value) { std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); @@ -42,7 +42,7 @@ std::unique_ptr average_closing_price(cudf::table_view stock_info_t // Compute the average of each company's closing price with entire column cudf::groupby::groupby grpby_obj(keys); - auto requests = make_single_aggregation_request(cudf::make_mean_aggregation(), val); + auto requests = make_single_aggregation_request(cudf::make_mean_aggregation(), val); auto agg_results = grpby_obj.aggregate(requests); diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 7ac3638b21c..ff665e2706a 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -106,8 +106,7 @@ class aggregation { }; /** - * @brief Derived class intended for enforcing operation-specific restrictions - * when calling various cudf functions. + * @brief Derived class intended for rolling_window specific aggregation usage.
* * As an example, rolling_window will only accept rolling_aggregation inputs, * and the appropriate derived classes (sum_aggregation, mean_aggregation, etc) @@ -121,6 +120,28 @@ class rolling_aggregation : public virtual aggregation { rolling_aggregation() {} }; +/** + * @brief Derived class intended for groupby specific aggregation usage. + */ +class groupby_aggregation : public virtual aggregation { + public: + ~groupby_aggregation() = default; + + protected: + groupby_aggregation() {} +}; + +/** + * @brief Derived class intended for groupby specific scan usage. + */ +class groupby_scan_aggregation : public virtual aggregation { + public: + ~groupby_scan_aggregation() = default; + + protected: + groupby_scan_aggregation() {} +}; + enum class udf_type : bool { CUDA, PTX }; /// Factory to create a SUM aggregation diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh similarity index 68% rename from cpp/include/cudf/ast/detail/transform.cuh rename to cpp/include/cudf/ast/detail/expression_evaluator.cuh index 89fa7d31980..fb198761115 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,9 @@ */ #pragma once -#include +#include #include -#include -#include +#include #include #include #include @@ -39,28 +38,6 @@ namespace ast { namespace detail { -// Type trait for wrapping nullable types in a thrust::optional. Non-nullable -// types are returned as is. -template -struct possibly_null_value; - -template -struct possibly_null_value { - using type = thrust::optional; -}; - -template -struct possibly_null_value { - using type = T; -}; - -template -using possibly_null_value_t = typename possibly_null_value::type; - -// Type used for intermediate storage in expression evaluation. -template -using IntermediateDataType = possibly_null_value_t; - /** * @brief A container for capturing the output of an evaluated expression. * @@ -140,14 +117,15 @@ struct value_expression_result /** * @brief Returns the underlying data. * - * @throws thrust::bad_optional_access if the underlying data is not valid. + * If the underlying data is not valid, behavior is undefined. Callers should + * use is_valid to check for validity before accessing the value. */ __device__ T value() const { // Using two separate constexprs silences compiler warnings, whereas an // if/else does not. An unconditional return is not ignored by the compiler // when has_nulls is true and therefore raises a compiler error. - if constexpr (has_nulls) { return _obj.value(); } + if constexpr (has_nulls) { return *_obj; } if constexpr (!has_nulls) { return _obj; } } @@ -214,147 +192,30 @@ struct mutable_column_expression_result }; /** - * @brief A container of all device data required to evaluate an expression on tables. - * - * This struct should never be instantiated directly. It is created by the - * `ast_plan` on construction, and the resulting member is publicly accessible - * for passing to kernels for constructing an `expression_evaluator`. + * @brief Dispatch to a binary operator based on a single data type. * + * This functor is a dispatcher for binary operations that assumes that both + * operands are of the same type. 
This assumption is encoded in the + * non-deducible template parameter LHS, the type of the left-hand operand, + * which is then used as the template parameter for both the left and right + * operands to the binary operator f. */ -struct device_ast_plan { - device_span data_references; - device_span literals; - device_span operators; - device_span operator_source_indices; - cudf::size_type num_intermediates; - int shmem_per_thread; -}; - -/** - * @brief Preprocessor for an expression acting on tables to generate data suitable for AST - * expression evaluation on the GPU. - * - * On construction, an AST plan creates a single "packed" host buffer of all - * data arrays that will be necessary to evaluate an expression on a pair of - * tables. This data is copied to a single contiguous device buffer, and - * pointers are generated to the individual components. Because the plan tends - * to be small, this is the most efficient approach for low latency. All the - * data required on the GPU can be accessed via the convenient `dev_plan` - * member struct, which can be used to construct an `expression_evaluator` on - * the device. - * - * Note that the resulting device data cannot be used once this class goes out of scope. - */ -struct ast_plan { - /** - * @brief Construct an AST plan for an expression operating on two tables. - * - * @param expr The expression for which to construct a plan. - * @param left The left table on which the expression acts. - * @param right The right table on which the expression acts. - * @param has_nulls Boolean indicator of whether or not the data contains nulls. - * @param stream Stream view on which to allocate resources and queue execution. - * @param mr Device memory resource used to allocate the returned column's device. - */ - ast_plan(detail::node const& expr, - cudf::table_view left, - cudf::table_view right, - bool has_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _linearizer(expr, left, right) - { - std::vector sizes; - std::vector data_pointers; - - extract_size_and_pointer(_linearizer.data_references(), sizes, data_pointers); - extract_size_and_pointer(_linearizer.literals(), sizes, data_pointers); - extract_size_and_pointer(_linearizer.operators(), sizes, data_pointers); - extract_size_and_pointer(_linearizer.operator_source_indices(), sizes, data_pointers); - - // Create device buffer - auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); - auto buffer_offsets = std::vector(sizes.size()); - thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0); - - auto h_data_buffer = std::make_unique(buffer_size); - for (unsigned int i = 0; i < data_pointers.size(); ++i) { - std::memcpy(h_data_buffer.get() + buffer_offsets[i], data_pointers[i], sizes[i]); - } - - _device_data_buffer = rmm::device_buffer(h_data_buffer.get(), buffer_size, stream, mr); - - stream.synchronize(); - - // Create device pointers to components of plan - auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); - dev_plan.data_references = device_span( - reinterpret_cast(device_data_buffer_ptr + - buffer_offsets[0]), - _linearizer.data_references().size()); - dev_plan.literals = device_span( - reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[1]), - _linearizer.literals().size()); - dev_plan.operators = device_span( - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), - _linearizer.operators().size()); - dev_plan.operator_source_indices = device_span( - 
reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), - _linearizer.operator_source_indices().size()); - dev_plan.num_intermediates = _linearizer.intermediate_count(); - dev_plan.shmem_per_thread = static_cast( - (has_nulls ? sizeof(IntermediateDataType) : sizeof(IntermediateDataType)) * - dev_plan.num_intermediates); - } - +struct single_dispatch_binary_operator { /** - * @brief Construct an AST plan for an expression operating on one table. + * @brief Single-type dispatch to a binary operation. * - * @param expr The expression for which to construct a plan. - * @param table The table on which the expression acts. - * @param has_nulls Boolean indicator of whether or not the data contains nulls. - * @param stream Stream view on which to allocate resources and queue execution. - * @param mr Device memory resource used to allocate the returned column's device. - */ - ast_plan(detail::node const& expr, - cudf::table_view table, - bool has_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : ast_plan(expr, table, table, has_nulls, stream, mr) - { - } - - cudf::data_type output_type() const { return _linearizer.root_data_type(); } - - device_ast_plan - dev_plan; ///< The collection of data required to evaluate the expression on the device. - - private: - /** - * @brief Helper function for adding components (operators, literals, etc) to AST plan + * @tparam LHS Left input type. + * @tparam F Type of forwarded binary operator functor. + * @tparam Ts Parameter pack of forwarded arguments. * - * @tparam T The underlying type of the input `std::vector` - * @param[in] v The `std::vector` containing components (operators, literals, etc). - * @param[in,out] sizes The `std::vector` containing the size of each data buffer. - * @param[in,out] data_pointers The `std::vector` containing pointers to each data buffer. + * @param f Binary operator functor. + * @param args Forwarded arguments to `operator()` of `f`. */ - template - void extract_size_and_pointer(std::vector const& v, - std::vector& sizes, - std::vector& data_pointers) + template + CUDA_DEVICE_CALLABLE auto operator()(F&& f, Ts&&... args) { - auto const data_size = sizeof(T) * v.size(); - sizes.push_back(data_size); - data_pointers.push_back(v.data()); + f.template operator()(std::forward(args)...); } - - rmm::device_buffer - _device_data_buffer; ///< The device-side data buffer containing the plan information, which is - ///< owned by this class and persists until it is destroyed. - linearizer const _linearizer; ///< The linearizer created from the provided expression that is - ///< used to construct device-side operators and references. }; /** @@ -379,7 +240,7 @@ struct expression_evaluator { */ __device__ expression_evaluator(table_device_view const& left, table_device_view const& right, - device_ast_plan const& plan, + expression_device_view const& plan, IntermediateDataType* thread_intermediate_storage, null_equality compare_nulls = null_equality::EQUAL) : left(left), @@ -400,7 +261,7 @@ struct expression_evaluator { * @param compare_nulls Whether the equality operator returns true or false for two nulls. 
*/ __device__ expression_evaluator(table_device_view const& table, - device_ast_plan const& plan, + expression_device_view const& plan, IntermediateDataType* thread_intermediate_storage, null_equality compare_nulls = null_equality::EQUAL) : left(table), @@ -426,17 +287,26 @@ struct expression_evaluator { */ template ())> __device__ possibly_null_value_t resolve_input( - detail::device_data_reference device_data_reference, cudf::size_type row_index) const + detail::device_data_reference device_data_reference, + cudf::size_type left_row_index, + thrust::optional right_row_index = {}) const { auto const data_index = device_data_reference.data_index; auto const ref_type = device_data_reference.reference_type; // TODO: Everywhere in the code assumes that the table reference is either // left or right. Should we error-check somewhere to prevent // table_reference::OUTPUT from being specified? - auto const& table = device_data_reference.table_source == table_reference::LEFT ? left : right; - using ReturnType = possibly_null_value_t; + using ReturnType = possibly_null_value_t; if (ref_type == detail::device_data_reference_type::COLUMN) { // If we have nullable data, return an empty nullable type with no value if the data is null. + auto const& table = + (device_data_reference.table_source == table_reference::LEFT) ? left : right; + // Note that the code below assumes that a right index has been passed in + // any case where device_data_reference.table_source == table_reference::RIGHT. + // Otherwise, behavior is undefined. + auto const row_index = (device_data_reference.table_source == table_reference::LEFT) + ? left_row_index + : *right_row_index; if constexpr (has_nulls) { return table.column(data_index).is_valid(row_index) ? ReturnType(table.column(data_index).element(row_index)) @@ -462,7 +332,9 @@ struct expression_evaluator { template ())> __device__ possibly_null_value_t resolve_input( - detail::device_data_reference device_data_reference, cudf::size_type row_index) const + detail::device_data_reference device_data_reference, + cudf::size_type left_row_index, + thrust::optional right_row_index = {}) const { cudf_assert(false && "Unsupported type in resolve_input."); // Unreachable return used to silence compiler warnings. 
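For orientation, the public entry point that ultimately drives this evaluator is the pairing already visible in the benchmark changes earlier in the diff: expressions are built from `cudf::ast::operation` and evaluated with `cudf::compute_column`. A hedged sketch (header locations are assumed):

```c++
#include <cudf/ast/expressions.hpp>
#include <cudf/column/column.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>

#include <memory>

// Sketch only: evaluate (col0 + col1) over a table through the AST interface.
std::unique_ptr<cudf::column> add_first_two_columns(cudf::table_view table)
{
  auto const col0 = cudf::ast::column_reference(0);
  auto const col1 = cudf::ast::column_reference(1);
  auto const sum  = cudf::ast::operation(cudf::ast::ast_operator::ADD, col0, col1);
  return cudf::compute_column(table, sum);
}
```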
@@ -484,11 +356,11 @@ struct expression_evaluator { */ template __device__ void operator()(OutputType& output_object, - const cudf::size_type input_row_index, - const detail::device_data_reference input, - const detail::device_data_reference output, - const cudf::size_type output_row_index, - const ast_operator op) const + cudf::size_type const input_row_index, + detail::device_data_reference const input, + detail::device_data_reference const output, + cudf::size_type const output_row_index, + ast_operator const op) const { auto const typed_input = resolve_input(input, input_row_index); ast_operator_dispatcher(op, @@ -517,16 +389,16 @@ struct expression_evaluator { */ template __device__ void operator()(OutputType& output_object, - const cudf::size_type left_row_index, - const cudf::size_type right_row_index, - const detail::device_data_reference lhs, - const detail::device_data_reference rhs, - const detail::device_data_reference output, - const cudf::size_type output_row_index, - const ast_operator op) const + cudf::size_type const left_row_index, + cudf::size_type const right_row_index, + detail::device_data_reference const lhs, + detail::device_data_reference const rhs, + detail::device_data_reference const output, + cudf::size_type const output_row_index, + ast_operator const op) const { - auto const typed_lhs = resolve_input(lhs, left_row_index); - auto const typed_rhs = resolve_input(rhs, right_row_index); + auto const typed_lhs = resolve_input(lhs, left_row_index, right_row_index); + auto const typed_rhs = resolve_input(rhs, left_row_index, right_row_index); ast_operator_dispatcher(op, binary_expression_output_handler(*this), output_object, @@ -544,11 +416,11 @@ struct expression_evaluator { __device__ void operator()(OutputType& output_object, cudf::size_type left_row_index, cudf::size_type right_row_index, - const detail::device_data_reference lhs, - const detail::device_data_reference rhs, - const detail::device_data_reference output, + detail::device_data_reference const lhs, + detail::device_data_reference const rhs, + detail::device_data_reference const output, cudf::size_type output_row_index, - const ast_operator op) const + ast_operator const op) const { cudf_assert(false && "Invalid binary dispatch operator for the provided input."); } @@ -587,19 +459,18 @@ struct expression_evaluator { cudf::size_type const right_row_index, cudf::size_type const output_row_index) { - auto operator_source_index = static_cast(0); + cudf::size_type operator_source_index{0}; for (cudf::size_type operator_index = 0; operator_index < plan.operators.size(); - operator_index++) { + ++operator_index) { // Execute operator auto const op = plan.operators[operator_index]; auto const arity = ast_operator_arity(op); if (arity == 1) { // Unary operator auto const input = - plan.data_references[plan.operator_source_indices[operator_source_index]]; + plan.data_references[plan.operator_source_indices[operator_source_index++]]; auto const output = - plan.data_references[plan.operator_source_indices[operator_source_index + 1]]; - operator_source_index += arity + 1; + plan.data_references[plan.operator_source_indices[operator_source_index++]]; auto input_row_index = input.table_source == table_reference::LEFT ? 
left_row_index : right_row_index; type_dispatcher(input.data_type, @@ -612,12 +483,12 @@ struct expression_evaluator { op); } else if (arity == 2) { // Binary operator - auto const lhs = plan.data_references[plan.operator_source_indices[operator_source_index]]; + auto const lhs = + plan.data_references[plan.operator_source_indices[operator_source_index++]]; auto const rhs = - plan.data_references[plan.operator_source_indices[operator_source_index + 1]]; + plan.data_references[plan.operator_source_indices[operator_source_index++]]; auto const output = - plan.data_references[plan.operator_source_indices[operator_source_index + 2]]; - operator_source_index += arity + 1; + plan.data_references[plan.operator_source_indices[operator_source_index++]]; type_dispatcher(lhs.data_type, detail::single_dispatch_binary_operator{}, *this, @@ -670,9 +541,9 @@ struct expression_evaluator { typename OutputType, CUDF_ENABLE_IF(is_rep_layout_compatible())> __device__ void resolve_output(OutputType& output_object, - const detail::device_data_reference device_data_reference, - const cudf::size_type row_index, - const possibly_null_value_t result) const + detail::device_data_reference const device_data_reference, + cudf::size_type const row_index, + possibly_null_value_t const result) const { auto const ref_type = device_data_reference.reference_type; if (ref_type == detail::device_data_reference_type::COLUMN) { @@ -690,9 +561,9 @@ struct expression_evaluator { typename OutputType, CUDF_ENABLE_IF(not is_rep_layout_compatible())> __device__ void resolve_output(OutputType& output_object, - const detail::device_data_reference device_data_reference, - const cudf::size_type row_index, - const possibly_null_value_t result) const + detail::device_data_reference const device_data_reference, + cudf::size_type const row_index, + possibly_null_value_t const result) const { cudf_assert(false && "Invalid type in resolve_output."); } @@ -730,9 +601,9 @@ struct expression_evaluator { typename OutputType, std::enable_if_t, Input>>* = nullptr> __device__ void operator()(OutputType& output_object, - const cudf::size_type output_row_index, - const possibly_null_value_t input, - const detail::device_data_reference output) const + cudf::size_type const output_row_index, + possibly_null_value_t const input, + detail::device_data_reference const output) const { using OperatorFunctor = detail::operator_functor; using Out = cuda::std::invoke_result_t; @@ -752,9 +623,9 @@ struct expression_evaluator { typename OutputType, std::enable_if_t, Input>>* = nullptr> __device__ void operator()(OutputType& output_object, - const cudf::size_type output_row_index, - const possibly_null_value_t input, - const detail::device_data_reference output) const + cudf::size_type const output_row_index, + possibly_null_value_t const input, + detail::device_data_reference const output) const { cudf_assert(false && "Invalid unary dispatch operator for the provided input."); } @@ -790,10 +661,10 @@ struct expression_evaluator { std::enable_if_t< detail::is_valid_binary_op, LHS, RHS>>* = nullptr> __device__ void operator()(OutputType& output_object, - const cudf::size_type output_row_index, - const possibly_null_value_t lhs, - const possibly_null_value_t rhs, - const detail::device_data_reference output) const + cudf::size_type const output_row_index, + possibly_null_value_t const lhs, + possibly_null_value_t const rhs, + detail::device_data_reference const output) const { using OperatorFunctor = detail::operator_functor; using Out = cuda::std::invoke_result_t; 
@@ -832,10 +703,10 @@ struct expression_evaluator { std::enable_if_t< !detail::is_valid_binary_op, LHS, RHS>>* = nullptr> __device__ void operator()(OutputType& output_object, - const cudf::size_type output_row_index, - const possibly_null_value_t lhs, - const possibly_null_value_t rhs, - const detail::device_data_reference output) const + cudf::size_type const output_row_index, + possibly_null_value_t const lhs, + possibly_null_value_t const rhs, + detail::device_data_reference output) const { cudf_assert(false && "Invalid binary dispatch operator for the provided input."); } @@ -843,7 +714,7 @@ struct expression_evaluator { table_device_view const& left; ///< The left table to operate on. table_device_view const& right; ///< The right table to operate on. - device_ast_plan const& + expression_device_view const& plan; ///< The container of device data representing the expression to evaluate. IntermediateDataType* thread_intermediate_storage; ///< The shared memory store of intermediates produced during @@ -852,23 +723,6 @@ struct expression_evaluator { compare_nulls; ///< Whether the equality operator returns true or false for two nulls. }; -/** - * @brief Compute a new column by evaluating an expression tree on a table. - * - * This evaluates an expression over a table to produce a new column. Also called an n-ary - * transform. - * - * @param table The table used for expression evaluation. - * @param expr The root of the expression tree. - * @param stream Stream on which to perform the computation. - * @param mr Device memory resource. - * @return std::unique_ptr Output column. - */ -std::unique_ptr compute_column( - table_view const table, - expression const& expr, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace ast diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp new file mode 100644 index 00000000000..1f35b54ea61 --- /dev/null +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf { +namespace ast { +namespace detail { + +/** + * @brief Node data reference types. + * + * This enum is device-specific. For instance, intermediate data references are generated by the + * linearization process but cannot be explicitly created by the user. + */ +enum class device_data_reference_type { + COLUMN, // A value in a table column + LITERAL, // A literal value + INTERMEDIATE // An internal temporary value +}; + +/** + * @brief A device data reference describes a source of data used by a expression. + * + * This is a POD class used to create references describing data type and locations for consumption + * by the `row_evaluator`. 
+ */ +struct alignas(8) device_data_reference { + device_data_reference(device_data_reference_type reference_type, + cudf::data_type data_type, + cudf::size_type data_index, + table_reference table_source); + + device_data_reference(device_data_reference_type reference_type, + cudf::data_type data_type, + cudf::size_type data_index); + + device_data_reference_type const reference_type; // Source of data + cudf::data_type const data_type; // Type of data + cudf::size_type const data_index; // The column index of a table, index of a + // literal, or index of an intermediate + table_reference const table_source; + + bool operator==(device_data_reference const& rhs) const + { + return std::tie(data_index, reference_type, table_source) == + std::tie(rhs.data_index, rhs.reference_type, rhs.table_source); + } +}; + +// Type trait for wrapping nullable types in a thrust::optional. Non-nullable +// types are returned as is. +template +struct possibly_null_value; + +template +struct possibly_null_value { + using type = thrust::optional; +}; + +template +struct possibly_null_value { + using type = T; +}; + +template +using possibly_null_value_t = typename possibly_null_value::type; + +// Type used for intermediate storage in expression evaluation. +template +using IntermediateDataType = possibly_null_value_t; + +/** + * @brief A container of all device data required to evaluate an expression on tables. + * + * This struct should never be instantiated directly. It is created by the + * `expression_parser` on construction, and the resulting member is publicly accessible + * for passing to kernels for constructing an `expression_evaluator`. + * + */ +struct expression_device_view { + device_span data_references; + device_span literals; + device_span operators; + device_span operator_source_indices; + cudf::size_type num_intermediates; + int shmem_per_thread; +}; + +/** + * @brief The expression_parser traverses an expression and converts it into a form suitable for + * execution on the device. + * + * This class is part of a "visitor" pattern with the `expression` class. + * + * This class does pre-processing work on the host, validating operators and operand data types. It + * traverses downward from a root expression in a depth-first fashion, capturing information about + * the expressions and constructing vectors of information that are later used by the device for + * evaluating the abstract syntax tree as a "linear" list of operators whose input dependencies are + * resolved into intermediate data storage in shared memory. + */ +class expression_parser { + public: + /** + * @brief Construct a new expression_parser object + * + * @param expr The expression to create an evaluable expression_parser for. + * @param left The left table used for evaluating the abstract syntax tree. + * @param right The right table used for evaluating the abstract syntax tree. + */ + expression_parser(expression const& expr, + cudf::table_view const& left, + std::optional> right, + bool has_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _left{left}, + _right{right}, + _expression_count{0}, + _intermediate_counter{}, + _has_nulls(has_nulls) + { + expr.accept(*this); + move_to_device(stream, mr); + } + + /** + * @brief Construct a new expression_parser object + * + * @param expr The expression to create an evaluable expression_parser for. + * @param table The table used for evaluating the abstract syntax tree. 
+ */ + expression_parser(expression const& expr, + cudf::table_view const& table, + bool has_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : expression_parser(expr, table, {}, has_nulls, stream, mr) + { + } + + /** + * @brief Get the root data type of the abstract syntax tree. + * + * @return cudf::data_type + */ + cudf::data_type output_type() const; + + /** + * @brief Visit a literal expression. + * + * @param expr Literal expression. + * @return cudf::size_type Index of device data reference for the expression. + */ + cudf::size_type visit(literal const& expr); + + /** + * @brief Visit a column reference expression. + * + * @param expr Column reference expression. + * @return cudf::size_type Index of device data reference for the expression. + */ + cudf::size_type visit(column_reference const& expr); + + /** + * @brief Visit an expression expression. + * + * @param expr Expression expression. + * @return cudf::size_type Index of device data reference for the expression. + */ + cudf::size_type visit(operation const& expr); + + /** + * @brief Internal class used to track the utilization of intermediate storage locations. + * + * As expressions are being evaluated, they may generate "intermediate" data that is immediately + * consumed. Rather than manifesting this data in global memory, we can store intermediates of any + * fixed width type (up to 8 bytes) by placing them in shared memory. This class helps to track + * the number and indices of intermediate data in shared memory using a give-take model. Locations + * in shared memory can be "taken" and used for storage, "given back," and then later re-used. + * This aims to minimize the maximum amount of shared memory needed at any point during the + * evaluation. + * + */ + class intermediate_counter { + public: + intermediate_counter() : used_values(), max_used(0) {} + cudf::size_type take(); + void give(cudf::size_type value); + cudf::size_type get_max_used() const { return max_used; } + + private: + /** + * @brief Find the first missing value in a contiguous sequence of integers. + * + * From a sorted container of integers, find the first "missing" value. + * For example, {0, 1, 2, 4, 5} is missing 3, and {1, 2, 3} is missing 0. + * If there are no missing values, return the size of the container. + * + * @return cudf::size_type Smallest value not already in the container. + */ + cudf::size_type find_first_missing() const; + + std::vector used_values; + cudf::size_type max_used; + }; + + expression_device_view device_expression_data; ///< The collection of data required to evaluate + ///< the expression on the device. + + private: + /** + * @brief Helper function for adding components (operators, literals, etc) to AST plan + * + * @tparam T The underlying type of the input `std::vector` + * @param[in] v The `std::vector` containing components (operators, literals, etc). + * @param[in,out] sizes The `std::vector` containing the size of each data buffer. + * @param[in,out] data_pointers The `std::vector` containing pointers to each data buffer. 
+ */ + template + void extract_size_and_pointer(std::vector const& v, + std::vector& sizes, + std::vector& data_pointers) + { + auto const data_size = sizeof(T) * v.size(); + sizes.push_back(data_size); + data_pointers.push_back(v.data()); + } + + void move_to_device(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + { + std::vector sizes; + std::vector data_pointers; + + extract_size_and_pointer(_data_references, sizes, data_pointers); + extract_size_and_pointer(_literals, sizes, data_pointers); + extract_size_and_pointer(_operators, sizes, data_pointers); + extract_size_and_pointer(_operator_source_indices, sizes, data_pointers); + + // Create device buffer + auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); + auto buffer_offsets = std::vector(sizes.size()); + thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0); + + auto h_data_buffer = std::vector(buffer_size); + for (unsigned int i = 0; i < data_pointers.size(); ++i) { + std::memcpy(h_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]); + } + + _device_data_buffer = rmm::device_buffer(h_data_buffer.data(), buffer_size, stream, mr); + + stream.synchronize(); + + // Create device pointers to components of plan + auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); + device_expression_data.data_references = device_span( + reinterpret_cast(device_data_buffer_ptr + + buffer_offsets[0]), + _data_references.size()); + device_expression_data.literals = + device_span( + reinterpret_cast( + device_data_buffer_ptr + buffer_offsets[1]), + _literals.size()); + device_expression_data.operators = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), + _operators.size()); + device_expression_data.operator_source_indices = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), + _operator_source_indices.size()); + device_expression_data.num_intermediates = _intermediate_counter.get_max_used(); + device_expression_data.shmem_per_thread = static_cast( + (_has_nulls ? sizeof(IntermediateDataType) : sizeof(IntermediateDataType)) * + device_expression_data.num_intermediates); + } + + /** + * @brief Helper function for recursive traversal of expressions. + * + * When parsing an expression composed of subexpressions, all subexpressions + * must be evaluated before an operator can be applied to them. This method + * performs that recursive traversal (in conjunction with the + * `expression_parser.visit` and `expression.accept` methods if necessary to + * descend deeper into an expression tree). + * + * @param operands The operands to visit. + * + * @return The indices of the operands stored in the data references. + */ + std::vector visit_operands( + std::vector> operands); + + /** + * @brief Add a data reference to the internal list. + * + * @param data_ref The data reference to add. + * + * @return The index of the added data reference in the internal data references list. + */ + cudf::size_type add_data_reference(detail::device_data_reference data_ref); + + rmm::device_buffer + _device_data_buffer; ///< The device-side data buffer containing the plan information, which is + ///< owned by this class and persists until it is destroyed. 
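Editor's note: move_to_device above packs every host-side vector of the parser into one device allocation by recording each buffer's byte size, exclusive-scanning those sizes into offsets, and memcpy-ing each buffer into a single staging area before the upload. A minimal host-only sketch of that packing scheme (standard library only, no RMM or thrust):

#include <cstddef>
#include <cstring>
#include <numeric>
#include <utility>
#include <vector>

// Illustration of the packing used by move_to_device: sizes -> exclusive-scan offsets ->
// one contiguous staging buffer holding all source buffers back to back.
std::vector<char> pack_buffers(std::vector<std::pair<void const*, std::size_t>> const& srcs)
{
  std::vector<std::size_t> sizes;
  for (auto const& s : srcs) { sizes.push_back(s.second); }

  std::vector<std::size_t> offsets(sizes.size());
  std::exclusive_scan(sizes.begin(), sizes.end(), offsets.begin(), std::size_t{0});

  auto const total = std::accumulate(sizes.begin(), sizes.end(), std::size_t{0});
  std::vector<char> staging(total);
  for (std::size_t i = 0; i < srcs.size(); ++i) {
    std::memcpy(staging.data() + offsets[i], srcs[i].first, sizes[i]);
  }
  return staging;
}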
+ + cudf::table_view const& _left; + std::optional> _right; + cudf::size_type _expression_count; + intermediate_counter _intermediate_counter; + bool _has_nulls; + std::vector _data_references; + std::vector _operators; + std::vector _operator_source_indices; + std::vector _literals; +}; + +} // namespace detail + +} // namespace ast + +} // namespace cudf diff --git a/cpp/include/cudf/ast/detail/linearizer.hpp b/cpp/include/cudf/ast/detail/linearizer.hpp deleted file mode 100644 index 59eda1df7b7..00000000000 --- a/cpp/include/cudf/ast/detail/linearizer.hpp +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include - -namespace cudf { -namespace ast { - -// Forward declaration -enum class table_reference; -class literal; -class column_reference; -class expression; - -namespace detail { - -/** - * @brief Enum defining data reference types used by a node. - * - * This enum is device-specific. For instance, intermediate data references are generated by the - * linearization process but cannot be explicitly created by the user. - */ -enum class device_data_reference_type { - COLUMN, // A value in a table column - LITERAL, // A literal value - INTERMEDIATE // An internal temporary value -}; - -/** - * @brief A device data reference describes a source of data used by a node. - * - * This is a POD class used to create references describing data type and locations for consumption - * by the `row_evaluator`. - */ -struct alignas(8) device_data_reference { - device_data_reference(device_data_reference_type reference_type, - cudf::data_type data_type, - cudf::size_type data_index, - table_reference table_source); - - device_data_reference(device_data_reference_type reference_type, - cudf::data_type data_type, - cudf::size_type data_index); - - const device_data_reference_type reference_type; // Source of data - const cudf::data_type data_type; // Type of data - const cudf::size_type data_index; // The column index of a table, index of a - // literal, or index of an intermediate - const table_reference table_source; - - inline bool operator==(const device_data_reference& rhs) const - { - return std::tie(data_index, reference_type, table_source) == - std::tie(rhs.data_index, rhs.reference_type, rhs.table_source); - } -}; - -// Forward declaration -class linearizer; - -/** - * @brief A generic node that can be evaluated to return a value. - * - * This class is a part of a "visitor" pattern with the `linearizer` class. - * Nodes inheriting from this class can accept visitors. - */ -struct node { - virtual cudf::size_type accept(detail::linearizer& visitor) const = 0; -}; - -/** - * @brief The linearizer traverses an abstract syntax tree to prepare for execution on the device. - * - * This class is part of a "visitor" pattern with the `node` class. - * - * This class does pre-processing work on the host, validating operators and operand data types. 
It - * traverses downward from a root node in a depth-first fashion, capturing information about - * the nodes and constructing vectors of information that are later used by the device for - * evaluating the abstract syntax tree as a "linear" list of operators whose input dependencies are - * resolved into intermediate data storage in shared memory. - */ -class linearizer { - public: - /** - * @brief Construct a new linearizer object - * - * @param expr The expression to create an evaluable linearizer for. - * @param left The left table used for evaluating the abstract syntax tree. - * @param right The right table used for evaluating the abstract syntax tree. - */ - linearizer(detail::node const& expr, cudf::table_view left, cudf::table_view right) - : _left{left}, _right{right}, _node_count{0}, _intermediate_counter{} - { - expr.accept(*this); - } - - /** - * @brief Construct a new linearizer object - * - * @param expr The expression to create an evaluable linearizer for. - * @param table The table used for evaluating the abstract syntax tree. - */ - linearizer(detail::node const& expr, cudf::table_view table) - : _left{table}, _right{table}, _node_count{0}, _intermediate_counter{} - { - expr.accept(*this); - } - - /** - * @brief Get the root data type of the abstract syntax tree. - * - * @return cudf::data_type - */ - cudf::data_type root_data_type() const; - - /** - * @brief Get the maximum number of intermediates stored by the abstract syntax tree. - * - * @return cudf::size_type - */ - cudf::size_type intermediate_count() const { return _intermediate_counter.get_max_used(); } - - /** - * @brief Get the device data references. - * - * @return std::vector - */ - std::vector const& data_references() const - { - return _data_references; - } - - /** - * @brief Get the operators. - * - * @return std::vector - */ - std::vector const& operators() const { return _operators; } - - /** - * @brief Get the operator source indices. - * - * @return std::vector - */ - std::vector const& operator_source_indices() const - { - return _operator_source_indices; - } - - /** - * @brief Get the literal device views. - * - * @return std::vector - */ - std::vector const& literals() const - { - return _literals; - } - - /** - * @brief Visit a literal node. - * - * @param expr Literal node. - * @return cudf::size_type Index of device data reference for the node. - */ - cudf::size_type visit(literal const& expr); - - /** - * @brief Visit a column reference node. - * - * @param expr Column reference node. - * @return cudf::size_type Index of device data reference for the node. - */ - cudf::size_type visit(column_reference const& expr); - - /** - * @brief Visit an expression node. - * - * @param expr Expression node. - * @return cudf::size_type Index of device data reference for the node. - */ - cudf::size_type visit(expression const& expr); - - /** - * @brief Internal class used to track the utilization of intermediate storage locations. - * - * As nodes are being evaluated, they may generate "intermediate" data that is immediately - * consumed. Rather than manifesting this data in global memory, we can store intermediates of any - * fixed width type (up to 8 bytes) by placing them in shared memory. This class helps to track - * the number and indices of intermediate data in shared memory using a give-take model. Locations - * in shared memory can be "taken" and used for storage, "given back," and then later re-used. 
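Editor's note: the give/take bookkeeping described above can be modeled entirely on the host. A simplified sketch (the real intermediate_counter keeps used_values sorted and searches for the first missing value):

#include <algorithm>
#include <vector>

// Simplified model of the give/take scheme: take() hands out the smallest unused slot and
// tracks the high-water mark; give() returns a slot to the pool for reuse.
class slot_counter {
 public:
  int take()
  {
    int slot = 0;
    while (std::find(used_.begin(), used_.end(), slot) != used_.end()) { ++slot; }
    used_.push_back(slot);
    max_used_ = std::max(max_used_, slot + 1);
    return slot;
  }

  void give(int slot)
  {
    used_.erase(std::remove(used_.begin(), used_.end(), slot), used_.end());
  }

  int max_used() const { return max_used_; }  // scratch slots needed per thread

 private:
  std::vector<int> used_;
  int max_used_ = 0;
};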
- * This aims to minimize the maximum amount of shared memory needed at any point during the - * evaluation. - * - */ - class intermediate_counter { - public: - intermediate_counter() : used_values(), max_used(0) {} - cudf::size_type take(); - void give(cudf::size_type value); - cudf::size_type get_max_used() const { return max_used; } - - private: - cudf::size_type find_first_missing() const; - std::vector used_values; - cudf::size_type max_used; - }; - - private: - std::vector visit_operands( - std::vector> operands); - cudf::size_type add_data_reference(detail::device_data_reference data_ref); - - // State information about the "linearized" GPU execution plan - cudf::table_view const& _left; - cudf::table_view const& _right; - cudf::size_type _node_count; - intermediate_counter _intermediate_counter; - std::vector _data_references; - std::vector _operators; - std::vector _operator_source_indices; - std::vector _literals; -}; - -} // namespace detail - -} // namespace ast - -} // namespace cudf diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 01ec5b74b77..00723004a9f 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include #include @@ -787,14 +787,6 @@ struct single_dispatch_binary_operator_types { } }; -struct single_dispatch_binary_operator { - template - CUDA_DEVICE_CALLABLE auto operator()(F&& f, Ts&&... args) - { - f.template operator()(std::forward(args)...); - } -}; - /** * @brief Functor performing a type dispatch for a binary operator. * diff --git a/cpp/include/cudf/ast/nodes.hpp b/cpp/include/cudf/ast/expressions.hpp similarity index 52% rename from cpp/include/cudf/ast/nodes.hpp rename to cpp/include/cudf/ast/expressions.hpp index 70dda58816e..d9ba197f8fe 100644 --- a/cpp/include/cudf/ast/nodes.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -15,9 +15,6 @@ */ #pragma once -#include -#include -#include #include #include #include @@ -27,6 +24,75 @@ namespace cudf { namespace ast { +// Forward declaration. +namespace detail { +class expression_parser; +} + +/** + * @brief A generic expression that can be evaluated to return a value. + * + * This class is a part of a "visitor" pattern with the `linearizer` class. + * Nodes inheriting from this class can accept visitors. + */ +struct expression { + virtual cudf::size_type accept(detail::expression_parser& visitor) const = 0; + + virtual ~expression() {} +}; + +/** + * @brief Enum of supported operators. 
+ */ +enum class ast_operator { + // Binary operators + ADD, ///< operator + + SUB, ///< operator - + MUL, ///< operator * + DIV, ///< operator / using common type of lhs and rhs + TRUE_DIV, ///< operator / after promoting type to floating point + FLOOR_DIV, ///< operator / after promoting to 64 bit floating point and then + ///< flooring the result + MOD, ///< operator % + PYMOD, ///< operator % but following python's sign rules for negatives + POW, ///< lhs ^ rhs + EQUAL, ///< operator == + NOT_EQUAL, ///< operator != + LESS, ///< operator < + GREATER, ///< operator > + LESS_EQUAL, ///< operator <= + GREATER_EQUAL, ///< operator >= + BITWISE_AND, ///< operator & + BITWISE_OR, ///< operator | + BITWISE_XOR, ///< operator ^ + LOGICAL_AND, ///< operator && + LOGICAL_OR, ///< operator || + // Unary operators + IDENTITY, ///< Identity function + SIN, ///< Trigonometric sine + COS, ///< Trigonometric cosine + TAN, ///< Trigonometric tangent + ARCSIN, ///< Trigonometric sine inverse + ARCCOS, ///< Trigonometric cosine inverse + ARCTAN, ///< Trigonometric tangent inverse + SINH, ///< Hyperbolic sine + COSH, ///< Hyperbolic cosine + TANH, ///< Hyperbolic tangent + ARCSINH, ///< Hyperbolic sine inverse + ARCCOSH, ///< Hyperbolic cosine inverse + ARCTANH, ///< Hyperbolic tangent inverse + EXP, ///< Exponential (base e, Euler number) + LOG, ///< Natural Logarithm (base e) + SQRT, ///< Square-root (x^0.5) + CBRT, ///< Cube-root (x^(1.0/3)) + CEIL, ///< Smallest integer value not less than arg + FLOOR, ///< largest integer value not greater than arg + ABS, ///< Absolute value + RINT, ///< Rounds the floating-point argument arg to an integer value + BIT_INVERT, ///< Bitwise Not (~) + NOT ///< Logical Not (!) +}; + /** * @brief Enum of table references. * @@ -41,7 +107,7 @@ enum class table_reference { /** * @brief A literal value used in an abstract syntax tree. */ -class literal : public detail::node { +class literal : public expression { public: /** * @brief Construct a new literal object. @@ -96,21 +162,21 @@ class literal : public detail::node { * @param visitor Visitor. * @return cudf::size_type Index of device data reference for this instance. */ - cudf::size_type accept(detail::linearizer& visitor) const override; + cudf::size_type accept(detail::expression_parser& visitor) const override; private: - const cudf::detail::fixed_width_scalar_device_view_base value; + cudf::detail::fixed_width_scalar_device_view_base const value; }; /** - * @brief A node referring to data from a column in a table. + * @brief A expression referring to data from a column in a table. */ -class column_reference : public detail::node { +class column_reference : public expression { public: /** * @brief Construct a new column reference object * - * @param column_index Index of this column in the table (provided when the node is + * @param column_index Index of this column in the table (provided when the expression is * evaluated). * @param table_source Which table to use in cases with two tables (e.g. joins). */ @@ -140,7 +206,7 @@ class column_reference : public detail::node { * @param table Table used to determine types. * @return cudf::data_type */ - cudf::data_type get_data_type(const table_view& table) const + cudf::data_type get_data_type(table_view const& table) const { return table.column(get_column_index()).type(); } @@ -152,9 +218,9 @@ class column_reference : public detail::node { * @param right_table Right table used to determine types. 
* @return cudf::data_type */ - cudf::data_type get_data_type(const table_view& left_table, const table_view& right_table) const + cudf::data_type get_data_type(table_view const& left_table, table_view const& right_table) const { - const auto table = [&] { + auto const table = [&] { if (get_table_source() == table_reference::LEFT) { return left_table; } else if (get_table_source() == table_reference::RIGHT) { @@ -172,7 +238,7 @@ class column_reference : public detail::node { * @param visitor Visitor. * @return cudf::size_type Index of device data reference for this instance. */ - cudf::size_type accept(detail::linearizer& visitor) const override; + cudf::size_type accept(detail::expression_parser& visitor) const override; private: cudf::size_type column_index; @@ -180,43 +246,33 @@ class column_reference : public detail::node { }; /** - * @brief An expression node holds an operator and zero or more operands. + * @brief An operation expression holds an operator and zero or more operands. */ -class expression : public detail::node { +class operation : public expression { public: /** - * @brief Construct a new unary expression object. + * @brief Construct a new unary operation object. * * @param op Operator - * @param input Input node (operand) + * @param input Input expression (operand) */ - expression(ast_operator op, node const& input) : op(op), operands({input}) - { - if (cudf::ast::detail::ast_operator_arity(op) != 1) { - CUDF_FAIL("The provided operator is not a unary operator."); - } - } + operation(ast_operator op, expression const& input); /** - * @brief Construct a new binary expression object. + * @brief Construct a new binary operation object. * * @param op Operator - * @param left Left input node (left operand) - * @param right Right input node (right operand) + * @param left Left input expression (left operand) + * @param right Right input expression (right operand) */ - expression(ast_operator op, node const& left, node const& right) : op(op), operands({left, right}) - { - if (cudf::ast::detail::ast_operator_arity(op) != 2) { - CUDF_FAIL("The provided operator is not a binary operator."); - } - } + operation(ast_operator op, expression const& left, expression const& right); - // expression only stores references to nodes, so it does not accept r-value - // references: the calling code must own the nodes. - expression(ast_operator op, node&& input) = delete; - expression(ast_operator op, node&& left, node&& right) = delete; - expression(ast_operator op, node&& left, node const& right) = delete; - expression(ast_operator op, node const& left, node&& right) = delete; + // operation only stores references to expressions, so it does not accept r-value + // references: the calling code must own the expressions. + operation(ast_operator op, expression&& input) = delete; + operation(ast_operator op, expression&& left, expression&& right) = delete; + operation(ast_operator op, expression&& left, expression const& right) = delete; + operation(ast_operator op, expression const& left, expression&& right) = delete; /** * @brief Get the operator. @@ -228,9 +284,9 @@ class expression : public detail::node { /** * @brief Get the operands. * - * @return std::vector> + * @return std::vector> */ - std::vector> get_operands() const { return operands; } + std::vector> get_operands() const { return operands; } /** * @brief Accepts a visitor class. @@ -238,11 +294,11 @@ class expression : public detail::node { * @param visitor Visitor. 
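Editor's note: putting the renamed pieces together, `operation` replaces the old `expression` node type and the AST classes now live in cudf/ast/expressions.hpp; per the `detail::compute_column` declaration elsewhere in this change, evaluation is assumed to go through a public `cudf::compute_column`. A hedged usage sketch (the column index, scalar value, and header for compute_column are assumptions):

#include <cudf/ast/expressions.hpp>
#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>  // assumed home of the public compute_column

#include <memory>

// Build (col0 > 42) with the renamed API and evaluate it over a table. Operands are held
// by reference, so they must outlive the call to compute_column (they do here).
std::unique_ptr<cudf::column> greater_than_42(cudf::table_view const& table)
{
  auto value      = cudf::numeric_scalar<int32_t>(42);
  auto const lit  = cudf::ast::literal(value);
  auto const col  = cudf::ast::column_reference(0);
  auto const expr = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col, lit);
  return cudf::compute_column(table, expr);
}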
* @return cudf::size_type Index of device data reference for this instance. */ - cudf::size_type accept(detail::linearizer& visitor) const override; + cudf::size_type accept(detail::expression_parser& visitor) const override; private: - const ast_operator op; - const std::vector> operands; + ast_operator const op; + std::vector> const operands; }; } // namespace ast diff --git a/cpp/include/cudf/ast/operators.hpp b/cpp/include/cudf/ast/operators.hpp deleted file mode 100644 index 78e56340246..00000000000 --- a/cpp/include/cudf/ast/operators.hpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -namespace cudf { - -namespace ast { - -/** - * @brief Enum of supported operators. - */ -enum class ast_operator { - // Binary operators - ADD, ///< operator + - SUB, ///< operator - - MUL, ///< operator * - DIV, ///< operator / using common type of lhs and rhs - TRUE_DIV, ///< operator / after promoting type to floating point - FLOOR_DIV, ///< operator / after promoting to 64 bit floating point and then - ///< flooring the result - MOD, ///< operator % - PYMOD, ///< operator % but following python's sign rules for negatives - POW, ///< lhs ^ rhs - EQUAL, ///< operator == - NOT_EQUAL, ///< operator != - LESS, ///< operator < - GREATER, ///< operator > - LESS_EQUAL, ///< operator <= - GREATER_EQUAL, ///< operator >= - BITWISE_AND, ///< operator & - BITWISE_OR, ///< operator | - BITWISE_XOR, ///< operator ^ - LOGICAL_AND, ///< operator && - LOGICAL_OR, ///< operator || - // Unary operators - IDENTITY, ///< Identity function - SIN, ///< Trigonometric sine - COS, ///< Trigonometric cosine - TAN, ///< Trigonometric tangent - ARCSIN, ///< Trigonometric sine inverse - ARCCOS, ///< Trigonometric cosine inverse - ARCTAN, ///< Trigonometric tangent inverse - SINH, ///< Hyperbolic sine - COSH, ///< Hyperbolic cosine - TANH, ///< Hyperbolic tangent - ARCSINH, ///< Hyperbolic sine inverse - ARCCOSH, ///< Hyperbolic cosine inverse - ARCTANH, ///< Hyperbolic tangent inverse - EXP, ///< Exponential (base e, Euler number) - LOG, ///< Natural Logarithm (base e) - SQRT, ///< Square-root (x^0.5) - CBRT, ///< Cube-root (x^(1.0/3)) - CEIL, ///< Smallest integer value not less than arg - FLOOR, ///< largest integer value not greater than arg - ABS, ///< Absolute value - RINT, ///< Rounds the floating-point argument arg to an integer value - BIT_INVERT, ///< Bitwise Not (~) - NOT ///< Logical Not (!) -}; - -} // namespace ast - -} // namespace cudf diff --git a/cpp/include/cudf/ast/transform.hpp b/cpp/include/cudf/ast/transform.hpp deleted file mode 100644 index 59697e5f75c..00000000000 --- a/cpp/include/cudf/ast/transform.hpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace cudf { - -namespace ast { - -/** - * @brief Compute a new column by evaluating an expression tree on a table. - * - * This evaluates an expression over a table to produce a new column. Also called an n-ary - * transform. - * - * @param table The table used for expression evaluation. - * @param expr The root of the expression tree. - * @param mr Device memory resource. - * @return std::unique_ptr Output column. - */ -std::unique_ptr compute_column( - table_view const table, - expression const& expr, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace ast - -} // namespace cudf diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 87fbc1ac651..5950edabbfc 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -853,6 +854,14 @@ class alignas(16) column_device_view : public detail::column_device_view_base { return d_children[child_index]; } + /** + * @brief Returns a span containing the children of this column + */ + __device__ device_span children() const noexcept + { + return device_span(d_children, _num_children); + } + /** * @brief Returns the number of child columns * diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 3d90ac063e1..2e4ac870969 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -206,6 +206,21 @@ std::unique_ptr is_leap_year( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Extract the number of days in the month + * + * output[i] contains the number of days in the month of date `column[i]` + * output[i] is null if `column[i]` is null + * + * @throw cudf::logic_error if input column datatype is not a TIMESTAMP + * + * @param cudf::column_view of the input datetime values + * @return cudf::column of datatype INT16 of days in month of the corresponding date + */ +std::unique_ptr days_in_month( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns the quarter of the date * diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 163ad3e480f..4e4c63ae517 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -130,7 +130,9 @@ class aggregation_finalizer { // Declares the interface for the finalizer /** * @brief Derived class for specifying a sum aggregation */ -class sum_aggregation final : public rolling_aggregation { +class sum_aggregation final : public rolling_aggregation, + public groupby_aggregation, + public groupby_scan_aggregation { public: sum_aggregation() : aggregation(SUM) {} @@ -149,7 +151,7 @@ class sum_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a 
product aggregation */ -class product_aggregation final : public aggregation { +class product_aggregation final : public groupby_aggregation { public: product_aggregation() : aggregation(PRODUCT) {} @@ -168,7 +170,9 @@ class product_aggregation final : public aggregation { /** * @brief Derived class for specifying a min aggregation */ -class min_aggregation final : public rolling_aggregation { +class min_aggregation final : public rolling_aggregation, + public groupby_aggregation, + public groupby_scan_aggregation { public: min_aggregation() : aggregation(MIN) {} @@ -187,7 +191,9 @@ class min_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a max aggregation */ -class max_aggregation final : public rolling_aggregation { +class max_aggregation final : public rolling_aggregation, + public groupby_aggregation, + public groupby_scan_aggregation { public: max_aggregation() : aggregation(MAX) {} @@ -206,7 +212,9 @@ class max_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a count aggregation */ -class count_aggregation final : public rolling_aggregation { +class count_aggregation final : public rolling_aggregation, + public groupby_aggregation, + public groupby_scan_aggregation { public: count_aggregation(aggregation::Kind kind) : aggregation(kind) {} @@ -263,7 +271,7 @@ class all_aggregation final : public aggregation { /** * @brief Derived class for specifying a sum_of_squares aggregation */ -class sum_of_squares_aggregation final : public aggregation { +class sum_of_squares_aggregation final : public groupby_aggregation { public: sum_of_squares_aggregation() : aggregation(SUM_OF_SQUARES) {} @@ -282,7 +290,7 @@ class sum_of_squares_aggregation final : public aggregation { /** * @brief Derived class for specifying a mean aggregation */ -class mean_aggregation final : public rolling_aggregation { +class mean_aggregation final : public rolling_aggregation, public groupby_aggregation { public: mean_aggregation() : aggregation(MEAN) {} @@ -301,7 +309,7 @@ class mean_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a m2 aggregation */ -class m2_aggregation : public aggregation { +class m2_aggregation : public groupby_aggregation { public: m2_aggregation() : aggregation{M2} {} @@ -320,7 +328,7 @@ class m2_aggregation : public aggregation { /** * @brief Derived class for specifying a standard deviation/variance aggregation */ -class std_var_aggregation : public aggregation { +class std_var_aggregation : public groupby_aggregation { public: size_type _ddof; ///< Delta degrees of freedom @@ -339,7 +347,6 @@ class std_var_aggregation : public aggregation { CUDF_EXPECTS(k == aggregation::STD or k == aggregation::VARIANCE, "std_var_aggregation can accept only STD, VARIANCE"); } - size_type hash_impl() const { return std::hash{}(_ddof); } }; @@ -348,7 +355,10 @@ class std_var_aggregation : public aggregation { */ class var_aggregation final : public std_var_aggregation { public: - var_aggregation(size_type ddof) : std_var_aggregation{aggregation::VARIANCE, ddof} {} + var_aggregation(size_type ddof) + : aggregation{aggregation::VARIANCE}, std_var_aggregation{aggregation::VARIANCE, ddof} + { + } std::unique_ptr clone() const override { @@ -367,7 +377,10 @@ class var_aggregation final : public std_var_aggregation { */ class std_aggregation final : public std_var_aggregation { public: - std_aggregation(size_type ddof) : std_var_aggregation{aggregation::STD, ddof} {} + 
std_aggregation(size_type ddof) + : aggregation{aggregation::STD}, std_var_aggregation{aggregation::STD, ddof} + { + } std::unique_ptr clone() const override { @@ -384,7 +397,7 @@ class std_aggregation final : public std_var_aggregation { /** * @brief Derived class for specifying a median aggregation */ -class median_aggregation final : public aggregation { +class median_aggregation final : public groupby_aggregation { public: median_aggregation() : aggregation(MEDIAN) {} @@ -403,7 +416,7 @@ class median_aggregation final : public aggregation { /** * @brief Derived class for specifying a quantile aggregation */ -class quantile_aggregation final : public aggregation { +class quantile_aggregation final : public groupby_aggregation { public: quantile_aggregation(std::vector const& q, interpolation i) : aggregation{QUANTILE}, _quantiles{q}, _interpolation{i} @@ -449,7 +462,7 @@ class quantile_aggregation final : public aggregation { /** * @brief Derived class for specifying an argmax aggregation */ -class argmax_aggregation final : public rolling_aggregation { +class argmax_aggregation final : public rolling_aggregation, public groupby_aggregation { public: argmax_aggregation() : aggregation(ARGMAX) {} @@ -468,7 +481,7 @@ class argmax_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying an argmin aggregation */ -class argmin_aggregation final : public rolling_aggregation { +class argmin_aggregation final : public rolling_aggregation, public groupby_aggregation { public: argmin_aggregation() : aggregation(ARGMIN) {} @@ -487,7 +500,7 @@ class argmin_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a nunique aggregation */ -class nunique_aggregation final : public aggregation { +class nunique_aggregation final : public groupby_aggregation { public: nunique_aggregation(null_policy null_handling) : aggregation{NUNIQUE}, _null_handling{null_handling} @@ -523,7 +536,7 @@ class nunique_aggregation final : public aggregation { /** * @brief Derived class for specifying a nth element aggregation */ -class nth_element_aggregation final : public aggregation { +class nth_element_aggregation final : public groupby_aggregation { public: nth_element_aggregation(size_type n, null_policy null_handling) : aggregation{NTH_ELEMENT}, _n{n}, _null_handling{null_handling} @@ -582,7 +595,7 @@ class row_number_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a rank aggregation */ -class rank_aggregation final : public rolling_aggregation { +class rank_aggregation final : public rolling_aggregation, public groupby_scan_aggregation { public: rank_aggregation() : aggregation{RANK} {} @@ -601,7 +614,7 @@ class rank_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying a dense rank aggregation */ -class dense_rank_aggregation final : public rolling_aggregation { +class dense_rank_aggregation final : public rolling_aggregation, public groupby_scan_aggregation { public: dense_rank_aggregation() : aggregation{DENSE_RANK} {} @@ -620,7 +633,7 @@ class dense_rank_aggregation final : public rolling_aggregation { /** * @brief Derived aggregation class for specifying COLLECT_LIST aggregation */ -class collect_list_aggregation final : public rolling_aggregation { +class collect_list_aggregation final : public rolling_aggregation, public groupby_aggregation { public: explicit collect_list_aggregation(null_policy null_handling = null_policy::INCLUDE) : aggregation{COLLECT_LIST}, 
_null_handling{null_handling} @@ -656,7 +669,7 @@ class collect_list_aggregation final : public rolling_aggregation { /** * @brief Derived aggregation class for specifying COLLECT_SET aggregation */ -class collect_set_aggregation final : public rolling_aggregation { +class collect_set_aggregation final : public rolling_aggregation, public groupby_aggregation { public: explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE, null_equality nulls_equal = null_equality::EQUAL, @@ -795,7 +808,7 @@ class udf_aggregation final : public rolling_aggregation { /** * @brief Derived aggregation class for specifying MERGE_LISTS aggregation */ -class merge_lists_aggregation final : public aggregation { +class merge_lists_aggregation final : public groupby_aggregation { public: explicit merge_lists_aggregation() : aggregation{MERGE_LISTS} {} @@ -814,7 +827,7 @@ class merge_lists_aggregation final : public aggregation { /** * @brief Derived aggregation class for specifying MERGE_SETS aggregation */ -class merge_sets_aggregation final : public aggregation { +class merge_sets_aggregation final : public groupby_aggregation { public: explicit merge_sets_aggregation(null_equality nulls_equal, nan_equality nans_equal) : aggregation{MERGE_SETS}, _nulls_equal(nulls_equal), _nans_equal(nans_equal) @@ -855,7 +868,7 @@ class merge_sets_aggregation final : public aggregation { /** * @brief Derived aggregation class for specifying MERGE_M2 aggregation */ -class merge_m2_aggregation final : public aggregation { +class merge_m2_aggregation final : public groupby_aggregation { public: explicit merge_m2_aggregation() : aggregation{MERGE_M2} {} diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp index fb24b7669d7..0ae7ba0a6a6 100644 --- a/cpp/include/cudf/detail/reshape.hpp +++ b/cpp/include/cudf/detail/reshape.hpp @@ -34,5 +34,16 @@ std::unique_ptr tile( size_type count, rmm::cuda_stream_view = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::interleave_columns + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr interleave_columns( + table_view const& input, + rmm::cuda_stream_view = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index b94223cdabe..12948498455 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -35,6 +36,17 @@ std::unique_ptr transform( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::compute_column + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
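Editor's note: the aggregation classes above now also derive from groupby_aggregation (and, where applicable, groupby_scan_aggregation), which lets groupby requests reject unsupported aggregations at compile time. A hedged sketch of assembling a request, assuming the existing templated factories (e.g. make_sum_aggregation) gained groupby_aggregation instantiations as part of this change:

#include <cudf/aggregation.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>

// Assumption: aggregation_request now holds unique_ptr<groupby_aggregation>, so only
// aggregations that derive from groupby_aggregation (SUM, MIN, MAX, ...) are accepted.
cudf::groupby::aggregation_request make_sum_request(cudf::column_view const& values)
{
  cudf::groupby::aggregation_request req;
  req.values = values;
  req.aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
  return req;
}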
+ */ +std::unique_ptr compute_column( + table_view const table, + ast::operation const& expr, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::nans_to_nulls * diff --git a/java/src/main/java/ai/rapids/cudf/ast/Expression.java b/cpp/include/cudf/detail/utilities/visitor_overload.hpp similarity index 59% rename from java/src/main/java/ai/rapids/cudf/ast/Expression.java rename to cpp/include/cudf/detail/utilities/visitor_overload.hpp index 8d391298cef..a55ca323c50 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/Expression.java +++ b/cpp/include/cudf/detail/utilities/visitor_overload.hpp @@ -13,19 +13,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once -package ai.rapids.cudf.ast; +namespace cudf::detail { -import java.nio.ByteBuffer; -import java.nio.ByteOrder; +/** + * @brief Helper class to support inline-overloading for all of a variant's alternative types + */ +template +struct visitor_overload : Ts... { + using Ts::operator()...; +}; +template +visitor_overload(Ts...) -> visitor_overload; -/** Base class of every AST expression. */ -public abstract class Expression extends AstNode { - public CompiledExpression compile() { - int size = getSerializedSize(); - ByteBuffer bb = ByteBuffer.allocate(size); - bb.order(ByteOrder.nativeOrder()); - serialize(bb); - return new CompiledExpression(bb.array()); - } -} +} // namespace cudf::detail diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 5656b38a0ef..3b8354ebc9f 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -56,8 +56,23 @@ class sort_groupby_helper; * `values.size()` column must equal `keys.num_rows()`. */ struct aggregation_request { - column_view values; ///< The elements to aggregate - std::vector> aggregations; ///< Desired aggregations + column_view values; ///< The elements to aggregate + std::vector> aggregations; ///< Desired aggregations +}; + +/** + * @brief Request for groupby aggregation(s) for scanning a column. + * + * The group membership of each `value[i]` is determined by the corresponding + * row `i` in the original order of `keys` used to construct the + * `groupby`. I.e., for each `aggregation`, `values[i]` is aggregated with all + * other `values[j]` where rows `i` and `j` in `keys` are equivalent. + * + * `values.size()` column must equal `keys.num_rows()`. + */ +struct scan_request { + column_view values; ///< The elements to aggregate + std::vector> aggregations; ///< Desired aggregations }; /** @@ -222,7 +237,7 @@ class groupby { * specified in `requests`. */ std::pair, std::vector> scan( - host_span requests, + host_span requests, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -388,7 +403,7 @@ class groupby { rmm::mr::device_memory_resource* mr); std::pair, std::vector> sort_scan( - host_span requests, + host_span requests, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); }; diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp index 34410209c72..774690c939f 100644 --- a/cpp/include/cudf/io/avro.hpp +++ b/cpp/include/cudf/io/avro.hpp @@ -195,12 +195,9 @@ class avro_reader_options_builder { * * The following code snippet demonstrates how to read a dataset from a file: * @code - * ... 
- * std::string filepath = "dataset.avro"; - * cudf::avro_reader_options options = - * cudf::avro_reader_options::builder(cudf::source_info(filepath)); - * ... - * auto result = cudf::read_avro(options); + * auto source = cudf::io::source_info("dataset.avro"); + * auto options = cudf::io::avro_reader_options::builder(source); + * auto result = cudf::io::read_avro(options); * @endcode * * @param options Settings for controlling reading behavior. diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index 1dff99735ec..455ffce7ed8 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -104,14 +104,18 @@ class csv_reader_options { // Whether a quote inside a value is double-quoted bool _doublequote = true; // Names of columns to read as datetime - std::vector _infer_date_names; + std::vector _parse_dates_names; // Indexes of columns to read as datetime - std::vector _infer_date_indexes; + std::vector _parse_dates_indexes; + // Names of columns to parse as hexadecimal + std::vector _parse_hex_names; + // Indexes of columns to parse as hexadecimal + std::vector _parse_hex_indexes; // Conversion settings // Per-column types; disables type inference on those columns - std::variant, std::vector> _dtypes; + std::variant, std::map> _dtypes; // Additional values to recognize as boolean true values std::vector _true_values{"True", "TRUE", "true"}; // Additional values to recognize as boolean false values @@ -280,17 +284,27 @@ class csv_reader_options { /** * @brief Returns names of columns to read as datetime. */ - std::vector const& get_infer_date_names() const { return _infer_date_names; } + std::vector const& get_parse_dates_names() const { return _parse_dates_names; } /** * @brief Returns indexes of columns to read as datetime. */ - std::vector const& get_infer_date_indexes() const { return _infer_date_indexes; } + std::vector const& get_parse_dates_indexes() const { return _parse_dates_indexes; } + + /** + * @brief Returns names of columns to read as hexadecimal. + */ + std::vector const& get_parse_hex_names() const { return _parse_hex_names; } + + /** + * @brief Returns indexes of columns to read as hexadecimal. + */ + std::vector const& get_parse_hex_indexes() const { return _parse_hex_indexes; } /** * @brief Returns per-column types. */ - std::variant, std::vector> const& get_dtypes() const + std::variant, std::map> const& get_dtypes() const { return _dtypes; } @@ -547,9 +561,9 @@ class csv_reader_options { * * @param col_names Vector of column names to infer as datetime. */ - void set_infer_date_names(std::vector col_names) + void set_parse_dates(std::vector col_names) { - _infer_date_names = std::move(col_names); + _parse_dates_names = std::move(col_names); } /** @@ -557,30 +571,38 @@ class csv_reader_options { * * @param col_names Vector of column indices to infer as datetime. 
*/ - void set_infer_date_indexes(std::vector col_ind) + void set_parse_dates(std::vector col_ind) { _parse_dates_indexes = std::move(col_ind); } + + /** + * @brief Sets names of columns to parse as hexadecimal + * + * @param col_names Vector of column names to parse as hexadecimal + */ + void set_parse_hex(std::vector col_names) { - _infer_date_indexes = std::move(col_ind); + _parse_hex_names = std::move(col_names); } + /** + * @brief Sets indexes of columns to parse as hexadecimal + * + * @param col_names Vector of column indices to parse as hexadecimal + */ + void set_parse_hex(std::vector col_ind) { _parse_hex_indexes = std::move(col_ind); } + /** * @brief Sets per-column types * - * @param types Vector specifying the columns' target data types. + * @param types Column name -> data type map specifying the columns' target data types */ - void set_dtypes(std::vector types) { _dtypes = std::move(types); } + void set_dtypes(std::map types) { _dtypes = std::move(types); } /** - * @brief Sets per-column types, specified by the type's respective string representation. + * @brief Sets per-column types * - * @param types Vector of dtypes in which the column needs to be read. + * @param types Vector specifying the columns' target data types. */ - [[deprecated( - "The string-based interface will be deprecated." - "Use dtypes(std::vector) instead.")]] void - set_dtypes(std::vector types) - { - _dtypes = std::move(types); - } + void set_dtypes(std::vector types) { _dtypes = std::move(types); } /** * @brief Sets additional values to recognize as boolean true values. @@ -958,49 +980,70 @@ class csv_reader_options_builder { /** * @brief Sets names of columns to read as datetime. * - * @param col_names Vector of column names to infer as datetime. + * @param col_names Vector of column names to read as datetime. * @return this for chaining. */ - csv_reader_options_builder& infer_date_names(std::vector col_names) + csv_reader_options_builder& parse_dates(std::vector col_names) { - options._infer_date_names = std::move(col_names); + options._parse_dates_names = std::move(col_names); return *this; } /** * @brief Sets indexes of columns to read as datetime. * - * @param col_names Vector of column indices to infer as datetime. + * @param col_ind Vector of column indices to read as datetime + * @return this for chaining. + */ + csv_reader_options_builder& parse_dates(std::vector col_ind) + { + options._parse_dates_indexes = std::move(col_ind); + return *this; + } + + /** + * @brief Sets names of columns to parse as hexadecimal. + * + * @param col_names Vector of column names to parse as hexadecimal + * @return this for chaining. + */ + csv_reader_options_builder& parse_hex(std::vector col_names) + { + options._parse_hex_names = std::move(col_names); + return *this; + } + + /** + * @brief Sets indexes of columns to parse as hexadecimal. + * + * @param col_ind Vector of column indices to parse as hexadecimal * @return this for chaining. */ - csv_reader_options_builder& infer_date_indexes(std::vector col_ind) + csv_reader_options_builder& parse_hex(std::vector col_ind) { - options._infer_date_indexes = std::move(col_ind); + options._parse_hex_indexes = std::move(col_ind); return *this; } /** * @brief Sets per-column types. * - * @param types Vector of data types in which the column needs to be read. + * @param types Column name -> data type map specifying the columns' target data types * @return this for chaining. 
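Editor's note: with the renames above, the CSV options read more naturally from the builder. A hedged usage sketch (the file path, column names, and column index are hypothetical):

#include <cudf/io/csv.hpp>

#include <map>
#include <string>
#include <vector>

// parse_dates/parse_hex replace the old infer_date_* names, and dtypes can now be keyed
// by column name. All names below are made up for illustration.
cudf::io::table_with_metadata read_orders_csv()
{
  auto options =
    cudf::io::csv_reader_options::builder(cudf::io::source_info{"orders.csv"})
      .parse_dates(std::vector<std::string>{"ship_date"})  // read this column as datetime
      .parse_hex(std::vector<int>{2})                       // parse column 2 as hexadecimal
      .build();
  options.set_dtypes(
    std::map<std::string, cudf::data_type>{{"order_id", cudf::data_type{cudf::type_id::INT64}}});
  return cudf::io::read_csv(options);
}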
*/ - csv_reader_options_builder& dtypes(std::vector types) + csv_reader_options_builder& dtypes(std::map types) { options._dtypes = std::move(types); return *this; } /** - * @brief Sets per-column types, specified by the type's respective string representation. + * @brief Sets per-column types. * - * @param types Vector of dtypes in which the column needs to be read. + * @param types Vector of data types in which the column needs to be read. * @return this for chaining. */ - [[deprecated( - "The string-based interface will be deprecated." - "Use dtypes(std::vector) instead.")]] csv_reader_options_builder& - dtypes(std::vector types) + csv_reader_options_builder& dtypes(std::vector types) { options._dtypes = std::move(types); return *this; @@ -1108,11 +1151,9 @@ class csv_reader_options_builder { * * The following code snippet demonstrates how to read a dataset from a file: * @code - * std::string filepath = "dataset.csv"; - * cudf::io::csv_reader_options options = - * cudf::io::csv_reader_options::builder(cudf::source_info(filepath)); - * ... - * auto result = cudf::read_csv(options); + * auto source = cudf::io::source_info("dataset.csv"); + * auto options = cudf::io::csv_reader_options::builder(source); + * auto result = cudf::io::read_csv(options); * @endcode * * @param options Settings for controlling reading behavior. @@ -1437,12 +1478,12 @@ class csv_writer_options_builder { * * The following code snippet demonstrates how to write columns to a file: * @code - * std::string filepath = "dataset.csv"; - * cudf::io::sink_info sink_info(filepath); + * auto destination = cudf::io::sink_info("dataset.csv"); + * auto options = cudf::io::csv_writer_options(destination, table->view()) + * .na_rep(na) + * .include_header(include_header) + * .rows_per_chunk(rows_per_chunk); * - * cudf::io::csv_writer_options options = cudf::io::csv_writer_options(sink_info, - * table->view()).na_rep(na).include_header(include_header).rows_per_chunk(rows_per_chunk); - * ... * cudf::io::write_csv(options); * @endcode * diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 2f4d0936d8b..31201e30ac6 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -23,7 +23,9 @@ #include +#include #include +#include #include namespace cudf { @@ -66,7 +68,7 @@ class json_reader_options { source_info _source; // Data types of the column; empty to infer dtypes - std::vector _dtypes; + std::variant, std::map> _dtypes; // Specify the compression format of the source or infer from file extension compression_type _compression = compression_type::AUTO; @@ -114,7 +116,10 @@ class json_reader_options { /** * @brief Returns data types of the columns. */ - std::vector const& get_dtypes() const { return _dtypes; } + std::variant, std::map> const& get_dtypes() const + { + return _dtypes; + } /** * @brief Returns compression format of the source. @@ -141,19 +146,26 @@ class json_reader_options { */ bool is_enabled_dayfirst() const { return _dayfirst; } + /** + * @brief Set data types for columns to be read. + * + * @param types Vector of dtypes + */ + void set_dtypes(std::vector types) { _dtypes = std::move(types); } + /** * @brief Set data types for columns to be read. * * @param types Vector dtypes in string format. */ - void dtypes(std::vector types) { _dtypes = std::move(types); } + void set_dtypes(std::map types) { _dtypes = std::move(types); } /** * @brief Set the compression type. * * @param comp_type The compression type used. 
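Editor's note: the JSON reader gains the same name-keyed dtype specification. A hedged sketch (path, column names, and the builder overload for the map form are assumed from the surrounding changes):

#include <cudf/io/json.hpp>

#include <map>
#include <string>

// Column types for the JSON reader may now be supplied as a name -> data_type map rather
// than positional strings. Names below are made up for illustration.
cudf::io::table_with_metadata read_events_json()
{
  auto const options =
    cudf::io::json_reader_options::builder(cudf::io::source_info{"events.json"})
      .dtypes(std::map<std::string, cudf::data_type>{
        {"ts", cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}},
        {"count", cudf::data_type{cudf::type_id::INT32}}})
      .lines(true)
      .build();
  return cudf::io::read_json(options);
}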
*/ - void compression(compression_type comp_type) { _compression = comp_type; } + void set_compression(compression_type comp_type) { _compression = comp_type; } /** * @brief Set number of bytes to skip from source start. @@ -205,10 +217,22 @@ class json_reader_options_builder { /** * @brief Set data types for columns to be read. * - * @param types Vector dtypes in string format. - * @return this for chaining. + * @param types Vector of dtypes + * @return this for chaining + */ + json_reader_options_builder& dtypes(std::vector types) + { + options._dtypes = std::move(types); + return *this; + } + + /** + * @brief Set data types for columns to be read. + * + * @param types Column name -> dtype map. + * @return this for chaining */ - json_reader_options_builder& dtypes(std::vector types) + json_reader_options_builder& dtypes(std::map types) { options._dtypes = std::move(types); return *this; @@ -292,11 +316,9 @@ class json_reader_options_builder { * * The following code snippet demonstrates how to read a dataset from a file: * @code - * ... - * std::string filepath = "dataset.json"; - * cudf::read_json_options options = cudf::read_json_options::builder(cudf::source_info(filepath)); - * ... - * auto result = cudf::read_json(options); + * auto source = cudf::io::source_info("dataset.json"); + * auto options = cudf::io::read_json_options::builder(source); + * auto result = cudf::io::read_json(options); * @endcode * * @param options Settings for controlling reading behavior. diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 997f35ed922..4ae09b516a4 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -346,14 +346,14 @@ class orc_reader_options_builder { * * The following code snippet demonstrates how to read a dataset from a file: * @code - * ... - * std::string filepath = "dataset.orc"; - * cudf::orc_reader_options options = - * cudf::orc_reader_options::builder(cudf::source_info(filepath)); - * ... - * auto result = cudf::read_orc(options); + * auto source = cudf::io::source_info("dataset.orc"); + * auto options = cudf::io::orc_reader_options::builder(source); + * auto result = cudf::io::read_orc(options); * @endcode * + * Note: Support for reading files with struct columns is currently experimental, the output may not + * be as reliable as reading for other datatypes. + * * @param options Settings for controlling reading behavior. * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata. @@ -565,12 +565,9 @@ class orc_writer_options_builder { * * The following code snippet demonstrates how to write columns to a file: * @code - * ... - * std::string filepath = "dataset.orc"; - * cudf::orc_writer_options options = cudf::orc_writer_options::builder(cudf::sink_info(filepath), - * table->view()); - * ... - * cudf::write_orc(options); + * auto destination = cudf::io::sink_info("dataset.orc"); + * auto options = cudf::io::orc_writer_options::builder(destination, table->view()); + * cudf::io::write_orc(options); * @endcode * * @param options Settings for controlling reading behavior. 
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ecd9607a87e..25cbb6fd554 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -50,7 +50,7 @@ class parquet_reader_options_builder; class parquet_reader_options { source_info _source; - // Names of column to read; empty is all + // Path in schema of column to read; empty is all std::vector _columns; // List of individual row groups to read (ignored if empty) @@ -354,12 +354,9 @@ class parquet_reader_options_builder { * * The following code snippet demonstrates how to read a dataset from a file: * @code - * ... - * std::string filepath = "dataset.parquet"; - * cudf::io::parquet_reader_options options = - * cudf::io::parquet_reader_options::builder(cudf::source_info(filepath)); - * ... - * auto result = cudf::read_parquet(options); + * auto source = cudf::io::source_info("dataset.parquet"); + * auto options = cudf::io::parquet_reader_options::builder(source); + * auto result = cudf::io::read_parquet(options); * @endcode * * @param options Settings for controlling reading behavior @@ -784,12 +781,9 @@ class parquet_writer_options_builder { * * The following code snippet demonstrates how to write columns to a file: * @code - * ... - * std::string filepath = "dataset.parquet"; - * cudf::io::parquet_writer_options options = - * cudf::io::parquet_writer_options::builder(cudf::sink_info(filepath), table->view()); - * ... - * cudf::write_parquet(options); + * auto destination = cudf::io::sink_info("dataset.parquet"); + * auto options = cudf::io::parquet_writer_options::builder(destination, table->view()); + * cudf::io::write_parquet(options); * @endcode * * @param options Settings for controlling writing behavior. @@ -1019,15 +1013,12 @@ std::unique_ptr> merge_rowgroup_metadata( * one logical table by writing a series of individual cudf::tables. * * @code - * ... - * std::string filepath = "dataset.parquet"; - * cudf::io::chunked_parquet_writer_options options = - * cudf::io::chunked_parquet_writer_options::builder(cudf::sink_info(filepath), table->view()); - * ... - * cudf::io::parquet_chunked_writer writer(options) + * auto destination = cudf::io::sink_info("dataset.parquet"); + * auto options = cudf::io::chunked_parquet_writer_options::builder(destination, table->view()); + * auto writer = cudf::io::parquet_chunked_writer(options); + * * writer.write(table0) * writer.write(table1) - * ... * writer.close() * @endcode */ diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index d0d2083b85b..483cd75c739 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include @@ -673,8 +673,6 @@ class hash_join { * Result: {{1}, {0}} * @endcode * - * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` - * mismatch. * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. 
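As an illustration of the const-reference predicate interface, a minimal conditional inner join might look like the following sketch (here `left` and `right` are assumed to be existing cudf::table_view objects):

@code
// Join rows where column 0 of the left table equals column 0 of the right table.
auto left_col  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
auto right_col = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
auto predicate = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, left_col, right_col);

auto [left_indices, right_indices] = cudf::conditional_inner_join(left, right, predicate);
@endcode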
* * @param left The left table @@ -689,11 +687,12 @@ class hash_join { std::pair>, std::unique_ptr>> conditional_inner_join( - table_view left, - table_view right, - ast::expression binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional output_size = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns a pair of row index vectors corresponding to all pairs @@ -721,8 +720,6 @@ conditional_inner_join( * Result: {{0, 1, 2}, {None, 0, None}} * @endcode * - * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` - * mismatch. * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. * * @param left The left table @@ -736,10 +733,11 @@ conditional_inner_join( */ std::pair>, std::unique_ptr>> -conditional_left_join(table_view left, - table_view right, - ast::expression binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, +conditional_left_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional output_size = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -767,8 +765,6 @@ conditional_left_join(table_view left, * Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}} * @endcode * - * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` - * mismatch. * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. * * @param left The left table @@ -782,9 +778,9 @@ conditional_left_join(table_view left, */ std::pair>, std::unique_ptr>> -conditional_full_join(table_view left, - table_view right, - ast::expression binary_predicate, +conditional_full_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -808,8 +804,6 @@ conditional_full_join(table_view left, * Result: {1} * @endcode * - * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` - * mismatch. * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. * * @param left The left table @@ -823,11 +817,12 @@ conditional_full_join(table_view left, * `right` . */ std::unique_ptr> conditional_left_semi_join( - table_view left, - table_view right, - ast::expression binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional output_size = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Returns an index vector corresponding to all rows in the left table @@ -849,8 +844,6 @@ std::unique_ptr> conditional_left_semi_join( * Result: {0, 2} * @endcode * - * @throw cudf::logic_error if number of elements in `left_keys` or `right_keys` - * mismatch. * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. 
* * @param left The left table @@ -864,11 +857,111 @@ std::unique_ptr> conditional_left_semi_join( * `right` . */ std::unique_ptr> conditional_left_anti_join( - table_view left, - table_view right, - ast::expression binary_predicate, + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional output_size = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the exact number of matches (rows) when performing a + * conditional inner join between the specified tables where the predicate + * evaluates to true. + * + * If the provided predicate returns NULL for a pair of rows + * (left, right), that pair is not included in the output. + * + * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * + * @param left The left table + * @param right The right table + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return The size that would result from performing the requested join. + */ +std::size_t conditional_inner_join_size( + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the exact number of matches (rows) when performing a + * conditional left join between the specified tables where the predicate + * evaluates to true. + * + * If the provided predicate returns NULL for a pair of rows + * (left, right), that pair is not included in the output. + * + * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * + * @param left The left table + * @param right The right table + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return The size that would result from performing the requested join. + */ +std::size_t conditional_left_join_size( + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the exact number of matches (rows) when performing a + * conditional left semi join between the specified tables where the predicate + * evaluates to true. + * + * If the provided predicate returns NULL for a pair of rows + * (left, right), that pair is not included in the output. + * + * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * + * @param left The left table + * @param right The right table + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return The size that would result from performing the requested join. 
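The new size APIs pair with the optional output size parameter on the joins themselves; a sketch of that pattern follows (again with hypothetical `left` and `right` table_views, and assuming the optional carries a std::size_t):

@code
auto left_col  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
auto right_col = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT);
auto predicate = cudf::ast::operation(cudf::ast::ast_operator::GREATER, left_col, right_col);

// Compute the exact result size once ...
std::size_t join_size = cudf::conditional_left_join_size(left, right, predicate);

// ... then hand it back so the join can allocate its output without re-counting.
auto [left_indices, right_indices] = cudf::conditional_left_join(
  left, right, predicate, cudf::null_equality::EQUAL, join_size);
@endcode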
+ */ +std::size_t conditional_left_semi_join_size( + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns the exact number of matches (rows) when performing a + * conditional left anti join between the specified tables where the predicate + * evaluates to true. + * + * If the provided predicate returns NULL for a pair of rows + * (left, right), that pair is not included in the output. + * + * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * + * @param left The left table + * @param right The right table + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether the equality operator returns true or false for two nulls. + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return The size that would result from performing the requested join. + */ +std::size_t conditional_left_anti_join_size( + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index 13a3da14cce..39bd2984095 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -706,13 +706,16 @@ class list_scalar : public scalar { */ class struct_scalar : public scalar { public: - struct_scalar() = delete; - ~struct_scalar() = default; - struct_scalar(struct_scalar&& other) = default; - struct_scalar(struct_scalar const& other) = default; + struct_scalar() = delete; + ~struct_scalar() = default; + struct_scalar(struct_scalar&& other) = default; struct_scalar& operator=(struct_scalar const& other) = delete; struct_scalar& operator=(struct_scalar&& other) = delete; + struct_scalar(struct_scalar const& other, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Construct a new struct scalar object from table_view. * diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index 8758a28885f..6894c34a077 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -205,81 +204,6 @@ auto make_strings_children( return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); } -/** - * @brief Creates child offsets, chars columns and null mask, null count of a strings column by - * applying the template function that can be used for computing the output size of each string as - * well as create the output. - * - * @tparam SizeAndExecuteFunction Function must accept an index and return a size. - * It must have members `d_offsets`, `d_chars`, and `d_validities` which are set to memory - * containing the offsets column, chars column and string validities during write. - * - * @param size_and_exec_fn This is called twice. Once for the output size of each string, which is - * written into the `d_offsets` array. After that, `d_chars` is set and this - * is called again to fill in the chars memory. 
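The defaulted struct_scalar copy constructor is replaced by one that also accepts a stream and memory resource, so callers can control where the deep copy of the underlying table happens; a brief sketch (where `original`, `stream`, and `mr` are assumed to already exist):

@code
cudf::struct_scalar copied{original};                     // deep copy on the default stream
cudf::struct_scalar copied_async{original, stream, mr};   // explicit stream and memory resource
@endcode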
The `d_validities` array may - * be modified to set the value `0` for the corresponding rows that contain - * null string elements. - * @param exec_size Range for executing the function `size_and_exec_fn`. - * @param strings_count Number of strings. - * @param mr Device memory resource used to allocate the returned columns' device memory. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @return offsets child column, chars child column, null_mask, and null_count for a strings column. - */ -template -std::tuple, std::unique_ptr, rmm::device_buffer, size_type> -make_strings_children_with_null_mask( - SizeAndExecuteFunction size_and_exec_fn, - size_type exec_size, - size_type strings_count, - rmm::cuda_stream_view stream = rmm::cuda_stream_default, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto offsets_column = make_numeric_column( - data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); - auto offsets_view = offsets_column->mutable_view(); - auto d_offsets = offsets_view.template data(); - size_and_exec_fn.d_offsets = d_offsets; - - auto validities = rmm::device_uvector(strings_count, stream); - size_and_exec_fn.d_validities = validities.begin(); - - // This is called twice: once for offsets and validities, and once for chars - auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) { - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - exec_size, - size_and_exec_fn); - }; - - // Compute the string sizes (storing in `d_offsets`) and string validities - for_each_fn(size_and_exec_fn); - - // Compute the offsets from string sizes - thrust::exclusive_scan( - rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); - - // Now build the chars column - auto const bytes = cudf::detail::get_value(offsets_view, strings_count, stream); - auto chars_column = create_chars_child_column(bytes, stream, mr); - - // Execute the function fn again to fill the chars column. - // Note that if the output chars column has zero size, the function fn should not be called to - // avoid accidentally overwriting the offsets. - if (bytes > 0) { - size_and_exec_fn.d_chars = chars_column->mutable_view().template data(); - for_each_fn(size_and_exec_fn); - } - - // Finally compute null mask and null count from the validities array - auto [null_mask, null_count] = cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr); - - return std::make_tuple(std::move(offsets_column), - std::move(chars_column), - null_count > 0 ? std::move(null_mask) : rmm::device_buffer{}, - null_count); -} - // This template is a thin wrapper around per-context singleton objects. // It maintains a single object for each CUDA context. template diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index 28ab19e53d9..087d1a94603 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -72,22 +72,24 @@ std::unique_ptr replace_re( /** * @brief For each string, replaces any character sequence matching the given pattern - * using the repl template for back-references. 
+ * using the replacement template for back-references. * * Any null string entries return corresponding null output column entries. * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * + * @throw cudf::logic_error if capture index values in `replacement` are not in range 1-99 + * * @param strings Strings instance for this operation. * @param pattern The regular expression patterns to search within each string. - * @param repl The replacement template for creating the output string. + * @param replacement The replacement template for creating the output string. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ std::unique_ptr replace_with_backrefs( strings_column_view const& strings, std::string const& pattern, - std::string const& repl, + std::string const& replacement, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace strings diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index f5880e9b37f..af2858d948e 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -74,6 +75,24 @@ std::pair, size_type> nans_to_nulls( column_view const& input, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Compute a new column by evaluating an expression tree on a table. + * + * This evaluates an expression over a table to produce a new column. Also called an n-ary + * transform. + * + * @throws cudf::logic_error if passed an expression operating on table_reference::RIGHT. + * + * @param table The table used for expression evaluation. + * @param expr The root of the expression tree. + * @param mr Device memory resource. + * @return std::unique_ptr Output column. + */ +std::unique_ptr compute_column( + table_view const& table, + ast::expression const& expr, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Creates a bitmask from a column of boolean elements. 
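Since compute_column is the new public entry point for evaluating an expression tree over a table, a small usage sketch may help (here `input` is assumed to be an existing cudf::table_view; column references default to the left table):

@code
// Compute col0 * col1 + col2 as a new column.
auto c0 = cudf::ast::column_reference(0);
auto c1 = cudf::ast::column_reference(1);
auto c2 = cudf::ast::column_reference(2);

auto product = cudf::ast::operation(cudf::ast::ast_operator::MUL, c0, c1);
auto expr    = cudf::ast::operation(cudf::ast::ast_operator::ADD, product, c2);

std::unique_ptr<cudf::column> result = cudf::compute_column(input, expr);
@endcode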
* diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 016f2367139..f0c522257fb 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -362,6 +362,8 @@ std::unique_ptr make_sum_aggregation() } template std::unique_ptr make_sum_aggregation(); template std::unique_ptr make_sum_aggregation(); +template std::unique_ptr make_sum_aggregation(); +template std::unique_ptr make_sum_aggregation(); /// Factory to create a PRODUCT aggregation template @@ -370,6 +372,7 @@ std::unique_ptr make_product_aggregation() return std::make_unique(); } template std::unique_ptr make_product_aggregation(); +template std::unique_ptr make_product_aggregation(); /// Factory to create a MIN aggregation template @@ -379,6 +382,8 @@ std::unique_ptr make_min_aggregation() } template std::unique_ptr make_min_aggregation(); template std::unique_ptr make_min_aggregation(); +template std::unique_ptr make_min_aggregation(); +template std::unique_ptr make_min_aggregation(); /// Factory to create a MAX aggregation template @@ -388,6 +393,8 @@ std::unique_ptr make_max_aggregation() } template std::unique_ptr make_max_aggregation(); template std::unique_ptr make_max_aggregation(); +template std::unique_ptr make_max_aggregation(); +template std::unique_ptr make_max_aggregation(); /// Factory to create a COUNT aggregation template @@ -401,6 +408,10 @@ template std::unique_ptr make_count_aggregation( null_policy null_handling); template std::unique_ptr make_count_aggregation( null_policy null_handling); +template std::unique_ptr make_count_aggregation( + null_policy null_handling); +template std::unique_ptr make_count_aggregation( + null_policy null_handling); /// Factory to create a ANY aggregation template @@ -425,6 +436,8 @@ std::unique_ptr make_sum_of_squares_aggregation() return std::make_unique(); } template std::unique_ptr make_sum_of_squares_aggregation(); +template std::unique_ptr +make_sum_of_squares_aggregation(); /// Factory to create a MEAN aggregation template @@ -434,6 +447,7 @@ std::unique_ptr make_mean_aggregation() } template std::unique_ptr make_mean_aggregation(); template std::unique_ptr make_mean_aggregation(); +template std::unique_ptr make_mean_aggregation(); /// Factory to create a M2 aggregation template @@ -442,6 +456,7 @@ std::unique_ptr make_m2_aggregation() return std::make_unique(); } template std::unique_ptr make_m2_aggregation(); +template std::unique_ptr make_m2_aggregation(); /// Factory to create a VARIANCE aggregation template @@ -450,6 +465,8 @@ std::unique_ptr make_variance_aggregation(size_type ddof) return std::make_unique(ddof); } template std::unique_ptr make_variance_aggregation(size_type ddof); +template std::unique_ptr make_variance_aggregation( + size_type ddof); /// Factory to create a STD aggregation template @@ -458,6 +475,8 @@ std::unique_ptr make_std_aggregation(size_type ddof) return std::make_unique(ddof); } template std::unique_ptr make_std_aggregation(size_type ddof); +template std::unique_ptr make_std_aggregation( + size_type ddof); /// Factory to create a MEDIAN aggregation template @@ -466,6 +485,7 @@ std::unique_ptr make_median_aggregation() return std::make_unique(); } template std::unique_ptr make_median_aggregation(); +template std::unique_ptr make_median_aggregation(); /// Factory to create a QUANTILE aggregation template @@ -475,6 +495,8 @@ std::unique_ptr make_quantile_aggregation(std::vector const& q, in } template std::unique_ptr make_quantile_aggregation( std::vector const& q, 
interpolation i); +template std::unique_ptr make_quantile_aggregation( + std::vector const& q, interpolation i); /// Factory to create an ARGMAX aggregation template @@ -484,6 +506,7 @@ std::unique_ptr make_argmax_aggregation() } template std::unique_ptr make_argmax_aggregation(); template std::unique_ptr make_argmax_aggregation(); +template std::unique_ptr make_argmax_aggregation(); /// Factory to create an ARGMIN aggregation template @@ -493,6 +516,7 @@ std::unique_ptr make_argmin_aggregation() } template std::unique_ptr make_argmin_aggregation(); template std::unique_ptr make_argmin_aggregation(); +template std::unique_ptr make_argmin_aggregation(); /// Factory to create an NUNIQUE aggregation template @@ -502,6 +526,8 @@ std::unique_ptr make_nunique_aggregation(null_policy null_handling) } template std::unique_ptr make_nunique_aggregation( null_policy null_handling); +template std::unique_ptr make_nunique_aggregation( + null_policy null_handling); /// Factory to create an NTH_ELEMENT aggregation template @@ -511,6 +537,8 @@ std::unique_ptr make_nth_element_aggregation(size_type n, null_policy null } template std::unique_ptr make_nth_element_aggregation( size_type n, null_policy null_handling); +template std::unique_ptr make_nth_element_aggregation( + size_type n, null_policy null_handling); /// Factory to create a ROW_NUMBER aggregation template @@ -528,6 +556,8 @@ std::unique_ptr make_rank_aggregation() return std::make_unique(); } template std::unique_ptr make_rank_aggregation(); +template std::unique_ptr +make_rank_aggregation(); /// Factory to create a DENSE_RANK aggregation template @@ -536,6 +566,8 @@ std::unique_ptr make_dense_rank_aggregation() return std::make_unique(); } template std::unique_ptr make_dense_rank_aggregation(); +template std::unique_ptr +make_dense_rank_aggregation(); /// Factory to create a COLLECT_LIST aggregation template @@ -547,6 +579,8 @@ template std::unique_ptr make_collect_list_aggregation null_policy null_handling); template std::unique_ptr make_collect_list_aggregation( null_policy null_handling); +template std::unique_ptr make_collect_list_aggregation( + null_policy null_handling); /// Factory to create a COLLECT_SET aggregation template @@ -560,6 +594,8 @@ template std::unique_ptr make_collect_set_aggregation( null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); template std::unique_ptr make_collect_set_aggregation( null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); +template std::unique_ptr make_collect_set_aggregation( + null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); /// Factory to create a LAG aggregation template @@ -605,6 +641,7 @@ std::unique_ptr make_merge_lists_aggregation() return std::make_unique(); } template std::unique_ptr make_merge_lists_aggregation(); +template std::unique_ptr make_merge_lists_aggregation(); /// Factory to create a MERGE_SETS aggregation template @@ -615,6 +652,8 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal, } template std::unique_ptr make_merge_sets_aggregation(null_equality, nan_equality); +template std::unique_ptr make_merge_sets_aggregation( + null_equality, nan_equality); /// Factory to create a MERGE_M2 aggregation template @@ -623,6 +662,7 @@ std::unique_ptr make_merge_m2_aggregation() return std::make_unique(); } template std::unique_ptr make_merge_m2_aggregation(); +template std::unique_ptr make_merge_m2_aggregation(); namespace detail { namespace { diff --git a/cpp/src/ast/linearizer.cpp 
b/cpp/src/ast/expression_parser.cpp similarity index 61% rename from cpp/src/ast/linearizer.cpp rename to cpp/src/ast/expression_parser.cpp index 3e442305552..1072bff43dd 100644 --- a/cpp/src/ast/linearizer.cpp +++ b/cpp/src/ast/expression_parser.cpp @@ -13,9 +13,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include -#include +#include +#include +#include #include #include #include @@ -56,7 +56,7 @@ device_data_reference::device_data_reference(device_data_reference_type referenc { } -cudf::size_type linearizer::intermediate_counter::take() +cudf::size_type expression_parser::intermediate_counter::take() { auto const first_missing = find_first_missing(); used_values.insert(used_values.cbegin() + first_missing, first_missing); @@ -64,7 +64,7 @@ cudf::size_type linearizer::intermediate_counter::take() return first_missing; } -void linearizer::intermediate_counter::give(cudf::size_type value) +void expression_parser::intermediate_counter::give(cudf::size_type value) { // TODO: add comment auto const lower_bound = std::lower_bound(used_values.cbegin(), used_values.cend(), value); @@ -72,18 +72,7 @@ void linearizer::intermediate_counter::give(cudf::size_type value) used_values.erase(lower_bound); } -/** - * @brief Find the first missing value in a contiguous sequence of integers. - * - * From a sorted container of integers, find the first "missing" value. - * For example, {0, 1, 2, 4, 5} is missing 3, and {1, 2, 3} is missing 0. - * If there are no missing values, return the size of the container. - * - * @param start Starting index. - * @param end Ending index. - * @return cudf::size_type Smallest value not already in the container. - */ -cudf::size_type linearizer::intermediate_counter::find_first_missing() const +cudf::size_type expression_parser::intermediate_counter::find_first_missing() const { if (used_values.empty() || (used_values.front() != 0)) { return 0; } // Search for the first non-contiguous pair of elements. @@ -94,42 +83,62 @@ cudf::size_type linearizer::intermediate_counter::find_first_missing() const : used_values.size(); // No missing elements. Return the next element in the sequence. } -cudf::size_type linearizer::visit(literal const& expr) +cudf::size_type expression_parser::visit(literal const& expr) { - _node_count++; // Increment the node index - auto const data_type = expr.get_data_type(); // Resolve node type - auto device_view = expr.get_value(); // Construct a scalar device view - auto const literal_index = cudf::size_type(_literals.size()); // Push literal - _literals.push_back(device_view); - auto const source = detail::device_data_reference( - detail::device_data_reference_type::LITERAL, data_type, literal_index); // Push data reference - return add_data_reference(source); + if (_expression_count == 0) { + // Handle the trivial case of a literal as the entire expression. 
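// Illustrative note: with this branch taken, parsing a bare literal (or, below, a bare
// column_reference such as cudf::ast::column_reference(0)) behaves as if the caller had
// written operation(ast_operator::IDENTITY, <expr>), so every parsed tree is rooted at an
// operation whose result is directed to the output column.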
+ return visit(operation(ast_operator::IDENTITY, expr)); + } else { + _expression_count++; // Increment the expression index + auto const data_type = expr.get_data_type(); // Resolve expression type + auto device_view = expr.get_value(); // Construct a scalar device view + auto const literal_index = cudf::size_type(_literals.size()); // Push literal + _literals.push_back(device_view); + auto const source = detail::device_data_reference(detail::device_data_reference_type::LITERAL, + data_type, + literal_index); // Push data reference + return add_data_reference(source); + } } -cudf::size_type linearizer::visit(column_reference const& expr) +cudf::size_type expression_parser::visit(column_reference const& expr) { - // Increment the node index - _node_count++; - // Resolve node type - auto const data_type = expr.get_table_source() == table_reference::LEFT - ? expr.get_data_type(_left) - : expr.get_data_type(_right); - // Push data reference - auto const source = detail::device_data_reference(detail::device_data_reference_type::COLUMN, - data_type, - expr.get_column_index(), - expr.get_table_source()); - return add_data_reference(source); + if (_expression_count == 0) { + // Handle the trivial case of a column reference as the entire expression. + return visit(operation(ast_operator::IDENTITY, expr)); + } else { + // Increment the expression index + _expression_count++; + // Resolve expression type + cudf::data_type data_type; + if (expr.get_table_source() == table_reference::LEFT) { + data_type = expr.get_data_type(_left); + } else { + if (_right.has_value()) { + data_type = expr.get_data_type(*_right); + } else { + CUDF_FAIL( + "Your expression contains a reference to the RIGHT table even though it will only be " + "evaluated on a single table (by convention, the LEFT table)."); + } + } + // Push data reference + auto const source = detail::device_data_reference(detail::device_data_reference_type::COLUMN, + data_type, + expr.get_column_index(), + expr.get_table_source()); + return add_data_reference(source); + } } -cudf::size_type linearizer::visit(expression const& expr) +cudf::size_type expression_parser::visit(operation const& expr) { - // Increment the node index - auto const node_index = _node_count++; - // Visit children (operands) of this node + // Increment the expression index + auto const expression_index = _expression_count++; + // Visit children (operands) of this expression auto const operand_data_ref_indices = visit_operands(expr.get_operands()); // Resolve operand types - auto data_ref = [this](auto const& index) { return data_references()[index].data_type; }; + auto data_ref = [this](auto const& index) { return _data_references[index].data_type; }; auto begin = thrust::make_transform_iterator(operand_data_ref_indices.cbegin(), data_ref); auto end = begin + operand_data_ref_indices.size(); auto const operand_types = std::vector(begin, end); @@ -145,29 +154,30 @@ cudf::size_type linearizer::visit(expression const& expr) operand_data_ref_indices.cbegin(), operand_data_ref_indices.cend(), [this](auto const& data_reference_index) { - auto const operand_source = data_references()[data_reference_index]; + auto const operand_source = _data_references[data_reference_index]; if (operand_source.reference_type == detail::device_data_reference_type::INTERMEDIATE) { auto const intermediate_index = operand_source.data_index; _intermediate_counter.give(intermediate_index); } }); - // Resolve node type + // Resolve expression type auto const op = expr.get_operator(); auto const data_type = 
cudf::ast::detail::ast_operator_return_type(op, operand_types); _operators.push_back(op); // Push data reference auto const output = [&]() { - if (node_index == 0) { - // This node is the root. Output should be directed to the output column. + if (expression_index == 0) { + // This expression is the root. Output should be directed to the output column. return detail::device_data_reference( detail::device_data_reference_type::COLUMN, data_type, 0, table_reference::OUTPUT); } else { - // This node is not the root. Output is an intermediate value. + // This expression is not the root. Output is an intermediate value. // Ensure that the output type is fixed width and fits in the intermediate storage. if (!cudf::is_fixed_width(data_type)) { CUDF_FAIL( "The output data type is not a fixed-width type but must be stored in an intermediate."); - } else if (cudf::size_of(data_type) > sizeof(std::int64_t)) { + } else if (cudf::size_of(data_type) > (_has_nulls ? sizeof(IntermediateDataType) + : sizeof(IntermediateDataType))) { CUDF_FAIL("The output data type is too large to be stored in an intermediate."); } return detail::device_data_reference( @@ -183,14 +193,14 @@ cudf::size_type linearizer::visit(expression const& expr) return index; } -cudf::data_type linearizer::root_data_type() const +cudf::data_type expression_parser::output_type() const { - return data_references().empty() ? cudf::data_type(cudf::type_id::EMPTY) - : data_references().back().data_type; + return _data_references.empty() ? cudf::data_type(cudf::type_id::EMPTY) + : _data_references.back().data_type; } -std::vector linearizer::visit_operands( - std::vector> operands) +std::vector expression_parser::visit_operands( + std::vector> operands) { auto operand_data_reference_indices = std::vector(); for (auto const& operand : operands) { @@ -200,7 +210,7 @@ std::vector linearizer::visit_operands( return operand_data_reference_indices; } -cudf::size_type linearizer::add_data_reference(detail::device_data_reference data_ref) +cudf::size_type expression_parser::add_data_reference(detail::device_data_reference data_ref) { // If an equivalent data reference already exists, return its index. Otherwise add this data // reference and return the new index. @@ -215,16 +225,6 @@ cudf::size_type linearizer::add_data_reference(detail::device_data_reference dat } // namespace detail -cudf::size_type literal::accept(detail::linearizer& visitor) const { return visitor.visit(*this); } -cudf::size_type column_reference::accept(detail::linearizer& visitor) const -{ - return visitor.visit(*this); -} -cudf::size_type expression::accept(detail::linearizer& visitor) const -{ - return visitor.visit(*this); -} - } // namespace ast } // namespace cudf diff --git a/cpp/src/ast/expressions.cpp b/cpp/src/ast/expressions.cpp new file mode 100644 index 00000000000..88cc6650d6c --- /dev/null +++ b/cpp/src/ast/expressions.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudf { +namespace ast { + +operation::operation(ast_operator op, expression const& input) : op(op), operands({input}) +{ + if (cudf::ast::detail::ast_operator_arity(op) != 1) { + CUDF_FAIL("The provided operator is not a unary operator."); + } +} + +operation::operation(ast_operator op, expression const& left, expression const& right) + : op(op), operands({left, right}) +{ + if (cudf::ast::detail::ast_operator_arity(op) != 2) { + CUDF_FAIL("The provided operator is not a binary operator."); + } +} + +cudf::size_type literal::accept(detail::expression_parser& visitor) const +{ + return visitor.visit(*this); +} +cudf::size_type column_reference::accept(detail::expression_parser& visitor) const +{ + return visitor.visit(*this); +} +cudf::size_type operation::accept(detail::expression_parser& visitor) const +{ + return visitor.visit(*this); +} + +} // namespace ast + +} // namespace cudf diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index f5f3937089f..f4b6a8bf5fd 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -374,11 +374,18 @@ void traverse_children::operator()(host_span size_t { strings_column_view scv(b); - return a + (b.is_empty() - ? 0 - : cudf::detail::get_value( - scv.offsets(), scv.offset() + b.size(), stream) - - cudf::detail::get_value(scv.offsets(), scv.offset(), stream)); + return a + (scv.is_empty() ? 0 + // if the column is unsliced, skip the offset retrieval. + : scv.offset() > 0 + ? cudf::detail::get_value( + scv.offsets(), scv.offset() + scv.size(), stream) - + cudf::detail::get_value(scv.offsets(), scv.offset(), stream) + // if the offset() is 0, it can still be sliced to a shorter length. in this case + // we only need to read a single offset. otherwise just return the full length + // (chars_size()) + : scv.size() + 1 == scv.offsets().size() + ? 
scv.chars_size() + : cudf::detail::get_value(scv.offsets(), scv.size(), stream)); }); // note: output text must include "exceeds size_type range" for python error handling CUDF_EXPECTS(total_char_count <= static_cast(std::numeric_limits::max()), diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 4d8acb3bd3b..9879a6c5423 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -83,12 +83,6 @@ static __device__ int16_t const days_until_month[2][13] = { {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} // For leap years }; -CUDA_DEVICE_CALLABLE uint8_t days_in_month(cuda::std::chrono::month mon, bool is_leap_year) -{ - return days_until_month[is_leap_year][unsigned{mon}] - - days_until_month[is_leap_year][unsigned{mon} - 1]; -} - // Round up the date to the last day of the month and return the // date only (without the time component) struct extract_last_day_of_month { @@ -96,18 +90,23 @@ struct extract_last_day_of_month { CUDA_DEVICE_CALLABLE timestamp_D operator()(Timestamp const ts) const { using namespace cuda::std::chrono; - // IDEAL: does not work with CUDA10.0 due to nvcc compiler bug - // cannot invoke ym_last_day.day() - // const year_month_day orig_ymd(floor(ts)); - // const year_month_day_last ym_last_day(orig_ymd.year(), month_day_last(orig_ymd.month())); - // return timestamp_D(sys_days(ym_last_day)); - - // Only has the days - time component is chopped off, which is what we want - auto const days_since_epoch = floor(ts); - auto const date = year_month_day(days_since_epoch); - auto const last_day = days_in_month(date.month(), date.year().is_leap()); + const year_month_day ymd(floor(ts)); + auto const ymdl = year_month_day_last{ymd.year() / ymd.month() / last}; + return timestamp_D{sys_days{ymdl}}; + } +}; - return timestamp_D(days_since_epoch + days(last_day - static_cast(date.day()))); +// Extract the number of days of the month +// A similar operator to `extract_last_day_of_month`, except this returns +// an integer while the other returns a timestamp. +struct days_in_month_op { + template + CUDA_DEVICE_CALLABLE int16_t operator()(Timestamp const ts) const + { + using namespace cuda::std::chrono; + auto const date = year_month_day(floor(ts)); + auto const ymdl = year_month_day_last(date.year() / date.month() / last); + return static_cast(unsigned{ymdl.day()}); } }; @@ -144,6 +143,7 @@ struct extract_quarter_op { } }; +// Returns true if the year is a leap year struct is_leap_year_op { template CUDA_DEVICE_CALLABLE bool operator()(Timestamp const ts) const @@ -220,22 +220,6 @@ struct add_calendrical_months_functor { { } - // std chrono implementation is copied here due to nvcc bug 2909685 - // https://howardhinnant.github.io/date_algorithms.html#days_from_civil - static CUDA_DEVICE_CALLABLE timestamp_D - compute_sys_days(cuda::std::chrono::year_month_day const& ymd) - { - const int yr = static_cast(ymd.year()) - (ymd.month() <= cuda::std::chrono::month{2}); - const unsigned mth = static_cast(ymd.month()); - const unsigned dy = static_cast(ymd.day()); - - const int era = (yr >= 0 ? yr : yr - 399) / 400; - const unsigned yoe = static_cast(yr - era * 400); // [0, 399] - const unsigned doy = (153 * (mth + (mth > 2 ? 
-3 : 9)) + 2) / 5 + dy - 1; // [0, 365] - const unsigned doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; // [0, 146096] - return timestamp_D{duration_D{era * 146097 + static_cast(doe) - 719468}}; - } - template typename std::enable_if_t::value, void> operator()( rmm::cuda_stream_view stream) const @@ -265,15 +249,10 @@ struct add_calendrical_months_functor { // If the new date isn't valid, scale it back to the last day of the // month. - // IDEAL: if (!ymd.ok()) ymd = ymd.year()/ymd.month()/last; - auto month_days = days_in_month(ymd.month(), ymd.year().is_leap()); - if (unsigned{ymd.day()} > month_days) - ymd = ymd.year() / ymd.month() / day{month_days}; + if (!ymd.ok()) ymd = ymd.year() / ymd.month() / last; // Put back the time component to the date - return - // IDEAL: sys_days{ymd} + ... - compute_sys_days(ymd) + (time_val - days_since_epoch); + return sys_days{ymd} + (time_val - days_since_epoch); }); } }; @@ -393,6 +372,13 @@ std::unique_ptr is_leap_year(column_view const& column, return apply_datetime_op(column, stream, mr); } +std::unique_ptr days_in_month(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return apply_datetime_op(column, stream, mr); +} + std::unique_ptr extract_quarter(column_view const& column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -476,6 +462,13 @@ std::unique_ptr is_leap_year(column_view const& column, rmm::mr::device_ return detail::is_leap_year(column, rmm::cuda_stream_default, mr); } +std::unique_ptr days_in_month(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::days_in_month(column, rmm::cuda_stream_default, mr); +} + std::unique_ptr extract_quarter(column_view const& column, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/src/dictionary/detail/merge.cu b/cpp/src/dictionary/detail/merge.cu index 2ff0a3e0a2a..e972403cad3 100644 --- a/cpp/src/dictionary/detail/merge.cu +++ b/cpp/src/dictionary/detail/merge.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -62,8 +63,11 @@ std::unique_ptr merge(dictionary_column_view const& lcol, return make_dictionary_column( std::make_unique(lcol.keys(), stream, mr), std::move(indices_column), - rmm::device_buffer{ - lcol.has_nulls() || rcol.has_nulls() ? static_cast(merged_size) : 0, stream, mr}, + cudf::detail::create_null_mask( + lcol.has_nulls() || rcol.has_nulls() ? 
static_cast(merged_size) : 0, + mask_state::UNINITIALIZED, + stream, + mr), lcol.null_count() + rcol.null_count()); } diff --git a/cpp/src/groupby/common/utils.hpp b/cpp/src/groupby/common/utils.hpp index e8d5c60f81a..3da20fb9af3 100644 --- a/cpp/src/groupby/common/utils.hpp +++ b/cpp/src/groupby/common/utils.hpp @@ -24,8 +24,10 @@ namespace cudf { namespace groupby { namespace detail { -inline std::vector extract_results( - host_span requests, cudf::detail::result_cache& cache) + +template +inline std::vector extract_results(host_span requests, + cudf::detail::result_cache& cache) { std::vector results(requests.size()); diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 8c43c071a85..a26d69e3d46 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -120,7 +120,8 @@ struct empty_column_constructor { }; /// Make an empty table with appropriate types for requested aggs -auto empty_results(host_span requests) +template +auto empty_results(host_span requests) { std::vector empty_results; @@ -144,7 +145,8 @@ auto empty_results(host_span requests) } /// Verifies the agg requested on the request's values is valid -void verify_valid_requests(host_span requests) +template +void verify_valid_requests(host_span requests) { CUDF_EXPECTS( std::all_of( @@ -184,7 +186,7 @@ std::pair, std::vector> groupby::aggr // Compute scan requests std::pair, std::vector> groupby::scan( - host_span requests, rmm::mr::device_memory_resource* mr) + host_span requests, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS( diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 450a8313402..c43df77bb5e 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -152,7 +152,7 @@ void scan_result_functor::operator()(aggregation const& // Sort-based groupby std::pair, std::vector> groupby::sort_scan( - host_span requests, + host_span requests, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/src/interop/detail/arrow_allocator.cpp b/cpp/src/interop/detail/arrow_allocator.cpp new file mode 100644 index 00000000000..cb67c893573 --- /dev/null +++ b/cpp/src/interop/detail/arrow_allocator.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace cudf { +namespace detail { + +std::unique_ptr allocate_arrow_buffer(const int64_t size, arrow::MemoryPool* ar_mr) +{ + /* + nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBuffer + and `ValueOrDie` are used inside a CUDA compilation unit. 
+ + To work around this issue we compile an allocation shim in C++ and use + that from our cuda sources + */ + auto result = arrow::AllocateBuffer(size, ar_mr); + CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer"); + return std::move(result).ValueOrDie(); +} + +std::shared_ptr allocate_arrow_bitmap(const int64_t size, arrow::MemoryPool* ar_mr) +{ + /* + nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBuffer + and `ValueOrDie` are used inside a CUDA compilation unit. + + To work around this issue we compile an allocation shim in C++ and use + that from our cuda sources + */ + auto result = arrow::AllocateBitmap(size, ar_mr); + CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow bitmap"); + return std::move(result).ValueOrDie(); +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/interop/detail/arrow_allocator.hpp b/cpp/src/interop/detail/arrow_allocator.hpp new file mode 100644 index 00000000000..20099f91afa --- /dev/null +++ b/cpp/src/interop/detail/arrow_allocator.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace cudf { +namespace detail { + +// unique_ptr because that is what AllocateBuffer returns +std::unique_ptr allocate_arrow_buffer(const int64_t size, arrow::MemoryPool* ar_mr); + +// shared_ptr because that is what AllocateBitmap returns +std::shared_ptr allocate_arrow_bitmap(const int64_t size, arrow::MemoryPool* ar_mr); + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 3cd515e9981..3271804bf39 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -34,6 +34,8 @@ #include #include +#include "detail/arrow_allocator.hpp" + namespace cudf { namespace detail { namespace { @@ -48,10 +50,7 @@ std::shared_ptr fetch_data_buffer(column_view input_view, { const int64_t data_size_in_bytes = sizeof(T) * input_view.size(); - auto result = arrow::AllocateBuffer(data_size_in_bytes, ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); - - std::shared_ptr data_buffer = std::move(result.ValueOrDie()); + auto data_buffer = allocate_arrow_buffer(data_size_in_bytes, ar_mr); CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), input_view.data(), @@ -59,7 +58,7 @@ std::shared_ptr fetch_data_buffer(column_view input_view, cudaMemcpyDeviceToHost, stream.value())); - return data_buffer; + return std::move(data_buffer); } /** @@ -72,9 +71,7 @@ std::shared_ptr fetch_mask_buffer(column_view input_view, const int64_t mask_size_in_bytes = cudf::bitmask_allocation_size_bytes(input_view.size()); if (input_view.has_nulls()) { - auto result = arrow::AllocateBitmap(static_cast(input_view.size()), ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for mask"); - std::shared_ptr mask_buffer = std::move(result.ValueOrDie()); + auto mask_buffer = allocate_arrow_bitmap(static_cast(input_view.size()), 
ar_mr); CUDA_TRY(cudaMemcpyAsync( mask_buffer->mutable_data(), (input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(), @@ -163,10 +160,7 @@ std::shared_ptr dispatch_to_arrow::operator()( }); auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); - auto result = arrow::AllocateBuffer(buf_size_in_bytes, ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); - - std::shared_ptr data_buffer = std::move(result.ValueOrDie()); + auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), buf.data(), @@ -176,7 +170,7 @@ std::shared_ptr dispatch_to_arrow::operator()( auto type = arrow::decimal(18, -input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); - auto buffers = std::vector>{mask, data_buffer}; + auto buffers = std::vector>{mask, std::move(data_buffer)}; auto data = std::make_shared(type, input.size(), buffers); return std::make_shared(data); @@ -191,10 +185,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in { auto bitmask = bools_to_mask(input, stream); - auto result = arrow::AllocateBuffer(static_cast(bitmask.first->size()), ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer for data"); - - std::shared_ptr data_buffer = std::move(result.ValueOrDie()); + auto data_buffer = allocate_arrow_buffer(static_cast(bitmask.first->size()), ar_mr); CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), bitmask.first->data(), @@ -203,7 +194,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in stream.value())); return to_arrow_array(id, static_cast(input.size()), - data_buffer, + std::move(data_buffer), fetch_mask_buffer(input, ar_mr, stream), static_cast(input.null_count())); } @@ -225,19 +216,13 @@ std::shared_ptr dispatch_to_arrow::operator()( column_view input_view = (tmp_column != nullptr) ? 
tmp_column->view() : input; auto child_arrays = fetch_child_array(input_view, {{}, {}}, ar_mr, stream); if (child_arrays.empty()) { - arrow::Result> result; - // Empty string will have only one value in offset of 4 bytes - result = arrow::AllocateBuffer(4, ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate buffer"); - std::shared_ptr tmp_offset_buffer = std::move(result.ValueOrDie()); - tmp_offset_buffer->mutable_data()[0] = 0; - - result = arrow::AllocateBuffer(0, ar_mr); - CUDF_EXPECTS(result.ok(), "Failed to allocate buffer"); - std::shared_ptr tmp_data_buffer = std::move(result.ValueOrDie()); + auto tmp_offset_buffer = allocate_arrow_buffer(4, ar_mr); + auto tmp_data_buffer = allocate_arrow_buffer(0, ar_mr); + tmp_offset_buffer->mutable_data()[0] = 0; - return std::make_shared(0, tmp_offset_buffer, tmp_data_buffer); + return std::make_shared( + 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); } auto offset_buffer = child_arrays[0]->data()->buffers[1]; auto data_buffer = child_arrays[1]->data()->buffers[1]; diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 68ac67b900d..4d3736a41f0 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -193,7 +193,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) int actual_col = 0; // Going through all the columns of a given record - while (col < column_flags.size() && field_start <= row_end) { + while (col < column_flags.size() && field_start < row_end) { auto next_delimiter = cudf::io::gpu::seek_field_end(field_start, row_end, opts); // Checking if this is a column that the user wants --- user can filter columns @@ -579,7 +579,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) int col = 0; int actual_col = 0; - while (col < column_flags.size() && field_start <= row_end) { + while (col < column_flags.size() && field_start < row_end) { auto next_delimiter = cudf::io::gpu::seek_field_end(next_field, row_end, options); if (column_flags[col] & column_parse::enabled) { diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 70ce0fce1cc..7f85589a8aa 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -49,18 +49,6 @@ using cudf::device_span; using cudf::host_span; using cudf::detail::make_device_uvector_async; -namespace { -/** - * @brief Helper class to support inline-overloading for all of a variant's alternative types - */ -template -struct VisitorOverload : Ts... { - using Ts::operator()...; -}; -template -VisitorOverload(Ts...) 
-> VisitorOverload; -} // namespace - namespace cudf { namespace io { namespace detail { @@ -280,6 +268,41 @@ reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; } +std::vector reader::impl::select_data_types( + std::map const& col_type_map) +{ + std::vector selected_dtypes; + + for (int col = 0; col < num_actual_cols_; col++) { + if (column_flags_[col] & column_parse::enabled) { + auto const col_type_it = col_type_map.find(col_names_[col]); + CUDF_EXPECTS(col_type_it != col_type_map.end(), + "Must specify data types for all active columns"); + selected_dtypes.emplace_back(col_type_it->second); + } + } + return selected_dtypes; +} + +std::vector reader::impl::select_data_types(std::vector const& dtypes) +{ + std::vector selected_dtypes; + + if (dtypes.size() == 1) { + // If it's a single dtype, assign that dtype to all active columns + selected_dtypes.resize(num_active_cols_, dtypes.front()); + } else { + // If it's a list, assign dtypes to active columns in the given order + CUDF_EXPECTS(static_cast(dtypes.size()) >= num_actual_cols_, + "Must specify data types for all columns"); + + for (int col = 0; col < num_actual_cols_; col++) { + if (column_flags_[col] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[col]); } + } + } + return selected_dtypes; +} + table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) { auto const data_row_offsets = select_data_and_row_offsets(stream); @@ -355,13 +378,13 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) } } - // User can specify which columns should be inferred as datetime - if (!opts_.get_infer_date_indexes().empty() || !opts_.get_infer_date_names().empty()) { - for (const auto index : opts_.get_infer_date_indexes()) { + // User can specify which columns should be read as datetime + if (!opts_.get_parse_dates_indexes().empty() || !opts_.get_parse_dates_names().empty()) { + for (const auto index : opts_.get_parse_dates_indexes()) { column_flags_[index] |= column_parse::as_datetime; } - for (const auto& name : opts_.get_infer_date_names()) { + for (const auto& name : opts_.get_parse_dates_names()) { auto it = std::find(col_names_.begin(), col_names_.end(), name); if (it != col_names_.end()) { column_flags_[it - col_names_.begin()] |= column_parse::as_datetime; @@ -369,6 +392,20 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) } } + // User can specify which columns should be parsed as hexadecimal + if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) { + for (const auto index : opts_.get_parse_hex_indexes()) { + column_flags_[index] |= column_parse::as_hexadecimal; + } + + for (const auto& name : opts_.get_parse_hex_names()) { + auto it = std::find(col_names_.begin(), col_names_.end(), name); + if (it != col_names_.end()) { + column_flags_[it - col_names_.begin()] |= column_parse::as_hexadecimal; + } + } + } + // Return empty table rather than exception if nothing to load if (num_active_cols_ == 0) { return {std::make_unique
(), {}}; } @@ -382,11 +419,8 @@ table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) if (has_to_infer_column_types) { column_types = infer_column_types(data, row_offsets, stream); } else { - column_types = - std::visit(VisitorOverload{ - [&](const std::vector& data_types) { return data_types; }, - [&](const std::vector& dtypes) { return parse_column_types(dtypes); }}, - opts_.get_dtypes()); + column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); }, + opts_.get_dtypes()); } out_columns.reserve(column_types.size()); @@ -666,81 +700,6 @@ std::vector reader::impl::infer_column_types(device_span return dtypes; } -std::vector reader::impl::parse_column_types( - const std::vector& types_as_strings) -{ - std::vector dtypes; - - const bool is_dict = std::all_of(types_as_strings.begin(), - types_as_strings.end(), - [](const auto& s) { return s.find(':') != std::string::npos; }); - - if (!is_dict) { - if (types_as_strings.size() == 1) { - // If it's a single dtype, assign that dtype to all active columns - data_type dtype_; - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[0]); - dtypes.resize(num_active_cols_, dtype_); - for (int col = 0; col < num_actual_cols_; col++) { - column_flags_[col] |= col_flags_; - } - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } else { - // If it's a list, assign dtypes to active columns in the given order - CUDF_EXPECTS(static_cast(types_as_strings.size()) >= num_actual_cols_, - "Must specify data types for all columns"); - - auto dtype_ = std::back_inserter(dtypes); - - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(types_as_strings[col]); - column_flags_[col] |= col_flags_; - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } - } - } - } else { - // Translate vector of `name : dtype` strings to map - // NOTE: Incoming pairs can be out-of-order from column names in dataset - std::unordered_map col_type_map; - for (const auto& pair : types_as_strings) { - const auto pos = pair.find_last_of(':'); - const auto name = pair.substr(0, pos); - const auto dtype = pair.substr(pos + 1, pair.size()); - col_type_map[name] = dtype; - } - - auto dtype_ = std::back_inserter(dtypes); - - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { - CUDF_EXPECTS(col_type_map.find(col_names_[col]) != col_type_map.end(), - "Must specify data types for all active columns"); - column_parse::flags col_flags_; - std::tie(dtype_, col_flags_) = get_dtype_info(col_type_map[col_names_[col]]); - column_flags_[col] |= col_flags_; - CUDF_EXPECTS(dtypes.back().id() != cudf::type_id::EMPTY, "Unsupported data type"); - } - } - } - - if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) { - for (auto& type : dtypes) { - if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); } - } - } - - for (size_t i = 0; i < dtypes.size(); i++) { - // Replace EMPTY dtype with STRING - if (dtypes[i].id() == type_id::EMPTY) { dtypes[i] = data_type{type_id::STRING}; } - } - - return dtypes; -} - std::vector reader::impl::decode_data(device_span data, device_span row_offsets, host_span column_types, diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp index 29c6b48bc8a..4416457be16 100644 --- 
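For reference, the idiom behind both the removed VisitorOverload helper and the generic-lambda std::visit call above (and behind cudf::detail::visitor_overload, used a few hunks below for the JSON reader) is an overload set over a variant's alternatives. A minimal, self-contained sketch, with illustrative names and string dtypes standing in for cudf::data_type:

#include <iostream>
#include <map>
#include <string>
#include <variant>
#include <vector>

// Overload set built from lambdas; same shape as the removed VisitorOverload,
// but purely illustrative, not the cudf type itself.
template <typename... Ts>
struct overload : Ts... {
  using Ts::operator()...;
};
template <typename... Ts>
overload(Ts...) -> overload<Ts...>;  // deduction guide, implicit in C++20

// Stand-in for the reader option: dtypes given per position or keyed by column name.
using dtype_spec = std::variant<std::vector<std::string>, std::map<std::string, std::string>>;

int main()
{
  dtype_spec spec = std::map<std::string, std::string>{{"a", "int32"}, {"b", "float64"}};
  std::visit(overload{[](std::vector<std::string> const& v) {
                        std::cout << "dtypes in column order: " << v.size() << '\n';
                      },
                      [](std::map<std::string, std::string> const& m) {
                        std::cout << "dtypes by column name: " << m.size() << '\n';
                      }},
             spec);
  return 0;
}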
a/cpp/src/io/csv/reader_impl.hpp +++ b/cpp/src/io/csv/reader_impl.hpp @@ -182,13 +182,20 @@ class reader::impl { rmm::cuda_stream_view stream); /** - * @brief Parses the columns' data types from the vector of dtypes that are provided as strings. + * @brief Selects the columns' data types from the map of dtypes. * - * @param types_as_strings The vector of strings from which to parse the columns' target data - * types - * @return List of columns' data types + * @param col_type_map Column name -> data type map specifying the columns' target data types + * @return Sorted list of selected columns' data types */ - std::vector parse_column_types(std::vector const& types_as_strings); + std::vector select_data_types(std::map const& col_type_map); + + /** + * @brief Selects the columns' data types from the list of dtypes. + * + * @param dtypes Vector of data types specifying the columns' target data types + * @return Sorted list of selected columns' data types + */ + std::vector select_data_types(std::vector const& dtypes); /** * @brief Converts the row-column data and outputs to column bufferrs. diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index b4395d6c965..f1080342312 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -50,7 +51,6 @@ namespace json { using namespace cudf::io; namespace { - /** * @brief Estimates the maximum expected length or a row, based on the number * of columns @@ -87,12 +87,12 @@ std::unique_ptr
aggregate_keys_info(std::unique_ptr<table>
info) auto const info_view = info->view(); std::vector requests; requests.emplace_back(groupby::aggregation_request{info_view.column(0)}); - requests.back().aggregations.emplace_back(make_min_aggregation()); - requests.back().aggregations.emplace_back(make_nth_element_aggregation(0)); + requests.back().aggregations.emplace_back(make_min_aggregation()); + requests.back().aggregations.emplace_back(make_nth_element_aggregation(0)); requests.emplace_back(groupby::aggregation_request{info_view.column(1)}); - requests.back().aggregations.emplace_back(make_min_aggregation()); - requests.back().aggregations.emplace_back(make_nth_element_aggregation(0)); + requests.back().aggregations.emplace_back(make_min_aggregation()); + requests.back().aggregations.emplace_back(make_nth_element_aggregation(0)); // Aggregate by hash values groupby::groupby gb_obj( @@ -236,7 +236,9 @@ void reader::impl::ingest_raw_input(size_t range_offset, size_t range_size) { size_t map_range_size = 0; if (range_size != 0) { - map_range_size = range_size + calculate_max_row_size(options_.get_dtypes().size()); + auto const dtype_option_size = + std::visit([](const auto& dtypes) { return dtypes.size(); }, options_.get_dtypes()); + map_range_size = range_size + calculate_max_row_size(dtype_option_size); } // Support delayed opening of the file if using memory mapping datasource @@ -467,44 +469,29 @@ void reader::impl::set_column_names(device_span rec_starts, void reader::impl::set_data_types(device_span rec_starts, rmm::cuda_stream_view stream) { - auto const dtype = options_.get_dtypes(); - if (!dtype.empty()) { - CUDF_EXPECTS(dtype.size() == metadata_.column_names.size(), - "Need to specify the type of each column.\n"); - - // Assume that the dtype is in dictionary format only if all elements contain a colon - const bool is_dict = - std::all_of(std::cbegin(dtype), std::cend(dtype), [](const std::string& s) { - return std::find(std::cbegin(s), std::cend(s), ':') != std::cend(s); - }); - - auto split_on_colon = [](std::string_view s) { - auto const i = s.find(":"); - return std::pair{s.substr(0, i), s.substr(i + 1)}; - }; - - if (is_dict) { - std::map col_type_map; - std::transform( - std::cbegin(dtype), - std::cend(dtype), - std::inserter(col_type_map, col_type_map.end()), - [&](auto const& ts) { - auto const [col_name, type_str] = split_on_colon(ts); - return std::pair{std::string{col_name}, convert_string_to_dtype(std::string{type_str})}; - }); - - // Using the map here allows O(n log n) complexity - std::transform(std::cbegin(metadata_.column_names), - std::cend(metadata_.column_names), - std::back_inserter(dtypes_), - [&](auto const& column_name) { return col_type_map[column_name]; }); - } else { - std::transform(std::cbegin(dtype), - std::cend(dtype), - std::back_inserter(dtypes_), - [](auto const& col_dtype) { return convert_string_to_dtype(col_dtype); }); - } + bool has_to_infer_column_types = + std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes()); + if (!has_to_infer_column_types) { + dtypes_ = std::visit(cudf::detail::visitor_overload{ + [&](const std::vector& dtypes) { + CUDF_EXPECTS(dtypes.size() == metadata_.column_names.size(), + "Must specify types for all columns"); + return dtypes; + }, + [&](const std::map& dtypes) { + std::vector sorted_dtypes; + std::transform(std::cbegin(metadata_.column_names), + std::cend(metadata_.column_names), + std::back_inserter(sorted_dtypes), + [&](auto const& column_name) { + auto const it = dtypes.find(column_name); + CUDF_EXPECTS(it != dtypes.end(), 
+ "Must specify types for all columns"); + return it->second; + }); + return sorted_dtypes; + }}, + options_.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); auto const num_columns = metadata_.column_names.size(); diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index efc7b78cdb2..004812615eb 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -112,6 +112,7 @@ struct ColumnDesc { int32_t decimal_scale; // number of fractional decimal digits for decimal type int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) column_validity_info parent_validity_info; // consists of parent column valid_map and null count + uint32_t* parent_null_count_prefix_sums; // per-stripe prefix sums of parent column's null count }; /** @@ -138,7 +139,7 @@ struct EncChunk { int32_t scale; // scale for decimals or timestamps uint32_t* dict_index; // dictionary index from row index - device_span decimal_offsets; + uint32_t* decimal_offsets; column_device_view const* leaf_column; }; diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 033a2d9aff5..f7bd5ae86b8 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -759,6 +759,49 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks } } +/** + * @brief Compute the per-stripe prefix sum of null count, for each struct column in the current + * layer. + */ +void scan_null_counts(cudf::detail::hostdevice_2dvector const& chunks, + cudf::host_span> prefix_sums, + rmm::cuda_stream_view stream) +{ + auto const num_stripes = chunks.size().first; + if (num_stripes == 0) return; + + auto const num_columns = chunks.size().second; + std::vector>> prefix_sums_to_update; + for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { + // Null counts sums are only needed for children of struct columns + if (chunks[0][col_idx].type_kind == STRUCT) { + prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]); + } + } + auto const d_prefix_sums_to_update = + cudf::detail::make_device_uvector_async(prefix_sums_to_update, stream); + + thrust::for_each(rmm::exec_policy(stream), + d_prefix_sums_to_update.begin(), + d_prefix_sums_to_update.end(), + [chunks = cudf::detail::device_2dspan{chunks}] __device__( + auto const& idx_psums) { + auto const col_idx = idx_psums.first; + auto const psums = idx_psums.second; + + thrust::transform( + thrust::seq, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + psums.size(), + psums.begin(), + [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); + + thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin()); + }); + // `prefix_sums_to_update` goes out of scope, copy has to be done before we return + stream.synchronize(); +} + void reader::impl::decode_stream_data(cudf::detail::hostdevice_2dvector& chunks, size_t num_dicts, size_t skip_rows, @@ -817,8 +860,6 @@ void reader::impl::decode_stream_data(cudf::detail::hostdevice_2dvector> out_buffers(_selected_columns.size()); std::vector schema_info; std::vector> lvl_stripe_data(_selected_columns.size()); + std::vector>> null_count_prefix_sums; table_metadata out_metadata; // There are no columns in the table @@ -1124,6 +1168,14 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Logically view streams as columns std::vector stream_info; + null_count_prefix_sums.emplace_back(); + 
null_count_prefix_sums.back().reserve(_selected_columns[level].size()); + std::generate_n( + std::back_inserter(null_count_prefix_sums.back()), _selected_columns[level].size(), [&]() { + return cudf::detail::make_zeroed_device_uvector_async(total_num_stripes, + stream); + }); + // Tracker for eventually deallocating compressed and uncompressed data auto& stripe_data = lvl_stripe_data[level]; @@ -1207,10 +1259,12 @@ table_with_metadata reader::impl::read(size_type skip_rows, ? stripe_info->numberOfRows : _col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx]; chunk.column_num_rows = (level == 0) ? num_rows : _col_meta.num_child_rows[col_idx]; - chunk.parent_validity_info.valid_map_base = - (level == 0) ? nullptr : _col_meta.parent_column_data[col_idx].valid_map_base; - chunk.parent_validity_info.null_count = - (level == 0) ? 0 : _col_meta.parent_column_data[col_idx].null_count; + chunk.parent_validity_info = + (level == 0) ? column_validity_info{} : _col_meta.parent_column_data[col_idx]; + chunk.parent_null_count_prefix_sums = + (level == 0) + ? nullptr + : null_count_prefix_sums[level - 1][_col_meta.parent_column_index[col_idx]].data(); chunk.encoding_kind = stripe_footer->columns[selected_columns[col_idx].id].kind; chunk.type_kind = _metadata->per_file_metadata[stripe_source_mapping.source_idx] .ff.types[selected_columns[col_idx].id] @@ -1336,6 +1390,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Extract information to process nested child columns if (nested_col.size()) { + scan_null_counts(chunks, null_count_prefix_sums[level], stream); row_groups.device_to_host(stream, true); aggregate_child_meta(chunks, row_groups, out_buffers[level], nested_col, level); } diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 49c0c983992..7171b13d422 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -58,6 +58,7 @@ struct reader_column_meta { std::vector parent_column_data; // consists of parent column valid_map and null count + std::vector parent_column_index; std::vector child_start_row; // start row of child columns [stripe][column] std::vector diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 75ccd19d77b..41ee285ac25 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1167,8 +1167,17 @@ __global__ void __launch_bounds__(block_size) // No present stream: all rows are valid s->vals.u32[t] = ~0; } - while (s->top.nulls_desc_row < s->chunk.num_rows) { - uint32_t nrows_max = min(s->chunk.num_rows - s->top.nulls_desc_row, blockDim.x * 32); + auto const prev_parent_null_count = + (s->chunk.parent_null_count_prefix_sums != nullptr && stripe > 0) + ? s->chunk.parent_null_count_prefix_sums[stripe - 1] + : 0; + auto const parent_null_count = + (s->chunk.parent_null_count_prefix_sums != nullptr) + ? 
s->chunk.parent_null_count_prefix_sums[stripe] - prev_parent_null_count + : 0; + auto const num_elems = s->chunk.num_rows - parent_null_count; + while (s->top.nulls_desc_row < num_elems) { + uint32_t nrows_max = min(num_elems - s->top.nulls_desc_row, blockDim.x * 32); uint32_t nrows; size_t row_in; @@ -1187,7 +1196,7 @@ __global__ void __launch_bounds__(block_size) } __syncthreads(); - row_in = s->chunk.start_row + s->top.nulls_desc_row; + row_in = s->chunk.start_row + s->top.nulls_desc_row - prev_parent_null_count; if (row_in + nrows > first_row && row_in < first_row + max_num_rows && s->chunk.valid_map_base != NULL) { int64_t dst_row = row_in - first_row; @@ -1251,7 +1260,7 @@ __global__ void __launch_bounds__(block_size) // Sum up the valid counts and infer null_count null_count = block_reduce(temp_storage.bk_storage).Sum(null_count); if (t == 0) { - chunks[chunk_id].null_count = null_count; + chunks[chunk_id].null_count = parent_null_count + null_count; chunks[chunk_id].skip_count = s->chunk.skip_count; } } else { diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index d93845530d7..e0018ed7166 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -678,9 +678,7 @@ encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, ck.dtype_len = column.type_width(); } ck.scale = column.scale(); - if (ck.type_kind == TypeKind::DECIMAL) { - ck.decimal_offsets = device_span{column.decimal_offsets(), ck.num_rows}; - } + if (ck.type_kind == TypeKind::DECIMAL) { ck.decimal_offsets = column.decimal_offsets(); } } } } @@ -1140,26 +1138,28 @@ void writer::impl::init_state() out_sink_->host_write(MAGIC, std::strlen(MAGIC)); } -/** - * @brief pre-order append ORC device columns - */ -void __device__ append_orc_device_column(uint32_t& idx, - thrust::optional parent_idx, - device_span cols, - column_device_view col) -{ - auto const current_idx = idx; - cols[current_idx] = orc_column_device_view{col, parent_idx}; - idx++; - if (col.type().id() == type_id::LIST) { - append_orc_device_column( - idx, current_idx, cols, col.child(lists_column_view::child_column_index)); +template +struct device_stack { + __device__ device_stack(T* stack_storage, int capacity) + : stack(stack_storage), capacity(capacity), size(0) + { } - if (col.type().id() == type_id::STRUCT) { - for (auto child_idx = 0; child_idx < col.num_child_columns(); ++child_idx) { - append_orc_device_column(idx, current_idx, cols, col.child(child_idx)); - } + __device__ void push(T const& val) + { + cudf_assert(size < capacity and "Stack overflow"); + stack[size++] = val; } + __device__ T pop() + { + cudf_assert(size > 0 and "Stack underflow"); + return stack[--size]; + } + __device__ bool empty() { return size == 0; } + + private: + T* stack; + int capacity; + int size; }; orc_table_view make_orc_table_view(table_view const& table, @@ -1189,13 +1189,40 @@ orc_table_view make_orc_table_view(table_view const& table, } rmm::device_uvector d_orc_columns(orc_columns.size(), stream); + using stack_value_type = thrust::pair>; + rmm::device_uvector stack_storage(orc_columns.size(), stream); + // pre-order append ORC device columns cudf::detail::device_single_thread( - [d_orc_cols = device_span{d_orc_columns}, - d_table = d_table] __device__() mutable { + [d_orc_cols = device_span{d_orc_columns}, + d_table = d_table, + stack_storage = stack_storage.data(), + stack_storage_size = stack_storage.size()] __device__() { + device_stack stack(stack_storage, stack_storage_size); + + 
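The lambda being assembled here replaces the recursive append_orc_device_column with an explicit stack; children are pushed in reverse so the traversal stays pre-order. A host-side sketch of the same idea, with a made-up node type rather than column_device_view:

#include <iostream>
#include <stack>
#include <string>
#include <vector>

// Made-up stand-in for a column hierarchy (struct/list columns with children).
struct node {
  std::string name;
  std::vector<node> children;
};

int main()
{
  node root{"struct", {{"int_child", {}}, {"list_child", {{"leaf", {}}}}}};

  // Pre-order without recursion: pop a node, visit it, then push its children
  // in reverse so the leftmost child is processed next.
  std::stack<node const*> to_visit;
  to_visit.push(&root);
  while (!to_visit.empty()) {
    node const* cur = to_visit.top();
    to_visit.pop();
    std::cout << cur->name << '\n';
    for (auto it = cur->children.rbegin(); it != cur->children.rend(); ++it) {
      to_visit.push(&*it);
    }
  }
  return 0;
}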
thrust::for_each(thrust::seq, + thrust::make_reverse_iterator(d_table.end()), + thrust::make_reverse_iterator(d_table.begin()), + [&stack](column_device_view const& c) { + stack.push({&c, thrust::nullopt}); + }); + uint32_t idx = 0; - for (auto const& column : d_table) { - append_orc_device_column(idx, thrust::nullopt, d_orc_cols, column); + while (not stack.empty()) { + auto [col, parent] = stack.pop(); + d_orc_cols[idx] = orc_column_device_view{*col, parent}; + + if (col->type().id() == type_id::LIST) { + stack.push({&col->children()[lists_column_view::child_column_index], idx}); + } else if (col->type().id() == type_id::STRUCT) { + thrust::for_each(thrust::seq, + thrust::make_reverse_iterator(col->children().end()), + thrust::make_reverse_iterator(col->children().begin()), + [&stack, idx](column_device_view const& c) { + stack.push({&c, idx}); + }); + } + idx++; } }, stream); diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu new file mode 100644 index 00000000000..64b3dd69c0d --- /dev/null +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace io { +namespace parquet { +namespace gpu { + +template +__global__ void __launch_bounds__(block_size, 1) + initialize_chunk_hash_maps_kernel(device_span chunks) +{ + auto chunk = chunks[blockIdx.x]; + auto t = threadIdx.x; + // fut: Now that per-chunk dict is same size as ck.num_values, try to not use one block per chunk + for (size_t i = 0; i < chunk.dict_map_size; i += block_size) { + if (t + i < chunk.dict_map_size) { + new (&chunk.dict_map_slots[t + i].first) map_type::atomic_key_type{KEY_SENTINEL}; + new (&chunk.dict_map_slots[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL}; + } + } +} + +template +struct equality_functor { + column_device_view const& col; + __device__ bool operator()(size_type lhs_idx, size_type rhs_idx) + { + // We don't call this for nulls so this is fine + return equality_compare(col.element(lhs_idx), col.element(rhs_idx)); + } +}; + +template +struct hash_functor { + column_device_view const& col; + __device__ auto operator()(size_type idx) { return MurmurHash3_32{}(col.element(idx)); } +}; + +struct map_insert_fn { + map_type::device_mutable_view& map; + + template + __device__ bool operator()(column_device_view const& col, size_type i) + { + if constexpr (column_device_view::has_element_accessor()) { + auto hash_fn = hash_functor{col}; + auto equality_fn = equality_functor{col}; + return map.insert(std::make_pair(i, i), hash_fn, equality_fn); + } else { + cudf_assert(false && "Unsupported type to insert in map"); + } + return false; + } +}; + +struct map_find_fn { + map_type::device_view& map; + + template + __device__ auto operator()(column_device_view const& col, size_type i) + { + if constexpr (column_device_view::has_element_accessor()) { + auto hash_fn = hash_functor{col}; + 
auto equality_fn = equality_functor{col}; + return map.find(i, hash_fn, equality_fn); + } else { + cudf_assert(false && "Unsupported type to insert in map"); + } + return map.end(); + } +}; + +template +__global__ void __launch_bounds__(block_size, 1) + populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan chunks, + size_type num_rows) +{ + auto col_idx = blockIdx.y; + auto block_x = blockIdx.x; + auto t = threadIdx.x; + + auto start_row = + block_x * + max_page_fragment_size; // This is fragment size. all chunks are multiple of these many rows. + size_type end_row = min(start_row + max_page_fragment_size, num_rows); + + __shared__ EncColumnChunk* s_chunk; + __shared__ parquet_column_device_view s_col; + __shared__ size_type s_start_value_idx; + __shared__ size_type s_num_values; + if (t == 0) { + // Find the chunk this block is a part of + size_type num_rowgroups = chunks.size().first; + size_type rg_idx = 0; + while (rg_idx < num_rowgroups) { + if (auto ck = chunks[rg_idx][col_idx]; + start_row >= ck.start_row and start_row < ck.start_row + ck.num_rows) { + break; + } + ++rg_idx; + } + s_chunk = &chunks[rg_idx][col_idx]; + s_col = *(s_chunk->col_desc); + } + __syncthreads(); + if (not s_chunk->use_dictionary) { return; } + + if (t == 0) { + // Find the bounds of values in leaf column to be inserted into the map for current chunk + auto col = *(s_col.parent_column); + auto start_value_idx = start_row; + auto end_value_idx = end_row; + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + if (col.type().id() == type_id::STRUCT) { + start_value_idx += col.offset(); + end_value_idx += col.offset(); + col = col.child(0); + } else { + auto offset_col = col.child(lists_column_view::offsets_column_index); + start_value_idx = offset_col.element(start_value_idx + col.offset()); + end_value_idx = offset_col.element(end_value_idx + col.offset()); + col = col.child(lists_column_view::child_column_index); + } + } + s_start_value_idx = start_value_idx; + s_num_values = end_value_idx - start_value_idx; + } + __syncthreads(); + + column_device_view const& data_col = *s_col.leaf_column; + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage reduce_storage; + + // Make a view of the hash map + auto hash_map_mutable = map_type::device_mutable_view( + s_chunk->dict_map_slots, s_chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + auto hash_map = map_type::device_view( + s_chunk->dict_map_slots, s_chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + + __shared__ int total_num_dict_entries; + for (size_type i = 0; i < s_num_values; i += block_size) { + // add the value to hash map + size_type val_idx = i + t + s_start_value_idx; + bool is_valid = + (i + t < s_num_values && val_idx < data_col.size()) and data_col.is_valid(val_idx); + + // insert element at val_idx to hash map and count successful insertions + size_type is_unique = 0; + size_type uniq_elem_size = 0; + if (is_valid) { + auto found_slot = type_dispatcher(data_col.type(), map_find_fn{hash_map}, data_col, val_idx); + if (found_slot == hash_map.end()) { + is_unique = + type_dispatcher(data_col.type(), map_insert_fn{hash_map_mutable}, data_col, val_idx); + uniq_elem_size = [&]() -> size_type { + if (not is_unique) { return 0; } + switch (s_col.physical_type) { + case Type::INT32: return 4; + case Type::INT64: return 8; + case Type::INT96: return 12; + case Type::FLOAT: return 4; + case Type::DOUBLE: return 8; + case Type::BYTE_ARRAY: + if (data_col.type().id() == type_id::STRING) 
{ + // Strings are stored as 4 byte length + string bytes + return 4 + data_col.element(val_idx).size_bytes(); + } + case Type::FIXED_LEN_BYTE_ARRAY: + default: cudf_assert(false && "Unsupported type for dictionary encoding"); return 0; + } + }(); + } + } + + __syncthreads(); + auto num_unique = block_reduce(reduce_storage).Sum(is_unique); + __syncthreads(); + auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); + if (t == 0) { + total_num_dict_entries = atomicAdd(&s_chunk->num_dict_entries, num_unique); + total_num_dict_entries += num_unique; + atomicAdd(&s_chunk->uniq_data_size, uniq_data_size); + } + __syncthreads(); + + // Check if the num unique values in chunk has already exceeded max dict size and early exit + if (total_num_dict_entries > MAX_DICT_SIZE) { return; } + } +} + +template +__global__ void __launch_bounds__(block_size, 1) + collect_map_entries_kernel(device_span chunks) +{ + auto& chunk = chunks[blockIdx.x]; + if (not chunk.use_dictionary) { return; } + + auto t = threadIdx.x; + auto map = + map_type::device_view(chunk.dict_map_slots, chunk.dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + + __shared__ size_type counter; + if (t == 0) counter = 0; + __syncthreads(); + for (size_t i = 0; i < chunk.dict_map_size; i += block_size) { + if (t + i < chunk.dict_map_size) { + auto slot = map.begin_slot() + t + i; + auto key = static_cast(slot->first); + if (key != KEY_SENTINEL) { + auto loc = atomicAdd(&counter, 1); + cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size"); + chunk.dict_data[loc] = key; + // If sorting dict page ever becomes a hard requirement, enable the following statement and + // add a dict sorting step before storing into the slot's second field. + // chunk.dict_data_idx[loc] = t + i; + slot->second.store(loc); + // TODO: ^ This doesn't need to be atomic. Try casting to value_type ptr and just writing. 
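Taken together, the kernels in this new file build a dictionary in three passes: claim a map entry for the first occurrence of each value, compact the occupied slots into dict_data while assigning dense indices, then rewrite each row as an index into dict_data. A rough single-threaded analogue with an unordered_map and made-up values (the sentinel-based device map above assigns indices in whatever order slots are visited, so its ordering differs):

#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

int main()
{
  // Made-up column values for one chunk.
  std::vector<int32_t> values{7, 3, 7, 9, 3, 7};

  // Populate + collect: the first occurrence of a value claims an entry and is
  // assigned the next dense dictionary index; dict_data keeps the unique values.
  std::unordered_map<int32_t, uint32_t> slot_of;
  std::vector<int32_t> dict_data;
  for (auto v : values) {
    if (slot_of.emplace(v, static_cast<uint32_t>(dict_data.size())).second) {
      dict_data.push_back(v);
    }
  }

  // Get-indices: each row is rewritten as an index into dict_data.
  std::vector<uint32_t> dict_index;
  for (auto v : values) { dict_index.push_back(slot_of.at(v)); }

  for (auto i : dict_index) { std::cout << i << ' '; }  // prints: 0 1 0 2 1 0
  std::cout << '\n';
  return 0;
}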
+ } + } + } +} + +template +__global__ void __launch_bounds__(block_size, 1) + get_dictionary_indices_kernel(cudf::detail::device_2dspan chunks, + size_type num_rows) +{ + auto col_idx = blockIdx.y; + auto block_x = blockIdx.x; + auto t = threadIdx.x; + + size_type start_row = block_x * max_page_fragment_size; + size_type end_row = min(start_row + max_page_fragment_size, num_rows); + + __shared__ EncColumnChunk s_chunk; + __shared__ parquet_column_device_view s_col; + __shared__ size_type s_start_value_idx; + __shared__ size_type s_ck_start_val_idx; + __shared__ size_type s_num_values; + + if (t == 0) { + // Find the chunk this block is a part of + size_type num_rowgroups = chunks.size().first; + size_type rg_idx = 0; + while (rg_idx < num_rowgroups) { + if (auto ck = chunks[rg_idx][col_idx]; + start_row >= ck.start_row and start_row < ck.start_row + ck.num_rows) { + break; + } + ++rg_idx; + } + s_chunk = chunks[rg_idx][col_idx]; + s_col = *(s_chunk.col_desc); + + // Find the bounds of values in leaf column to be inserted into the map for current chunk + + auto col = *(s_col.parent_column); + auto start_value_idx = start_row; + auto end_value_idx = end_row; + auto chunk_start_val_idx = s_chunk.start_row; + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + if (col.type().id() == type_id::STRUCT) { + start_value_idx += col.offset(); + chunk_start_val_idx += col.offset(); + end_value_idx += col.offset(); + col = col.child(0); + } else { + auto offset_col = col.child(lists_column_view::offsets_column_index); + start_value_idx = offset_col.element(start_value_idx + col.offset()); + chunk_start_val_idx = offset_col.element(chunk_start_val_idx + col.offset()); + end_value_idx = offset_col.element(end_value_idx + col.offset()); + col = col.child(lists_column_view::child_column_index); + } + } + s_start_value_idx = start_value_idx; + s_ck_start_val_idx = chunk_start_val_idx; + s_num_values = end_value_idx - start_value_idx; + } + __syncthreads(); + + if (not s_chunk.use_dictionary) { return; } + + column_device_view const& data_col = *s_col.leaf_column; + + auto map = map_type::device_view( + s_chunk.dict_map_slots, s_chunk.dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + + for (size_t i = 0; i < s_num_values; i += block_size) { + if (t + i < s_num_values) { + auto val_idx = s_start_value_idx + t + i; + bool is_valid = + (i + t < s_num_values && val_idx < data_col.size()) ? 
data_col.is_valid(val_idx) : false; + + if (is_valid) { + auto found_slot = type_dispatcher(data_col.type(), map_find_fn{map}, data_col, val_idx); + cudf_assert(found_slot != map.end() && + "Unable to find value in map in dictionary index construction"); + if (found_slot != map.end()) { + // No need for atomic as this is not going to be modified by any other thread + auto* val_ptr = reinterpret_cast(&found_slot->second); + s_chunk.dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; + } + } + } + } +} + +void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_stream_view stream) +{ + constexpr int block_size = 1024; + initialize_chunk_hash_maps_kernel + <<>>(chunks); +} + +void populate_chunk_hash_maps(cudf::detail::device_2dspan chunks, + size_type num_rows, + rmm::cuda_stream_view stream) +{ + constexpr int block_size = 256; + auto const grid_x = cudf::detail::grid_1d(num_rows, max_page_fragment_size); + auto const num_columns = chunks.size().second; + dim3 const dim_grid(grid_x.num_blocks, num_columns); + + populate_chunk_hash_maps_kernel + <<>>(chunks, num_rows); +} + +void collect_map_entries(device_span chunks, rmm::cuda_stream_view stream) +{ + constexpr int block_size = 1024; + collect_map_entries_kernel<<>>(chunks); +} + +void get_dictionary_indices(cudf::detail::device_2dspan chunks, + size_type num_rows, + rmm::cuda_stream_view stream) +{ + constexpr int block_size = 256; + auto const grid_x = cudf::detail::grid_1d(num_rows, max_page_fragment_size); + auto const num_columns = chunks.size().second; + dim3 const dim_grid(grid_x.num_blocks, num_columns); + + get_dictionary_indices_kernel + <<>>(chunks, num_rows); +} +} // namespace gpu +} // namespace parquet +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/parquet/page_dict.cu b/cpp/src/io/parquet/page_dict.cu deleted file mode 100644 index 0c55828b120..00000000000 --- a/cpp/src/io/parquet/page_dict.cu +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include "parquet_gpu.hpp" - -#include - -#include - -#include - -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { -struct dict_state_s { - uint32_t row_cnt; - PageFragment* cur_fragment; - uint32_t* hashmap; - uint32_t total_dict_entries; //!< Total number of entries in dictionary - uint32_t dictionary_size; //!< Total dictionary size in bytes - uint32_t num_dict_entries; //!< Dictionary entries in current fragment to add - uint32_t frag_dict_size; - EncColumnChunk ck; - parquet_column_device_view col; - PageFragment frag; - volatile uint32_t scratch_red[32]; - uint16_t frag_dict[max_page_fragment_size]; -}; - -/** - * @brief Computes a 16-bit dictionary hash - */ -inline __device__ uint32_t uint32_hash16(uint32_t v) { return (v + (v >> 16)) & 0xffff; } - -inline __device__ uint32_t uint64_hash16(uint64_t v) -{ - return uint32_hash16((uint32_t)(v + (v >> 32))); -} - -inline __device__ uint32_t hash_string(const string_view& val) -{ - const char* p = val.data(); - uint32_t len = val.size_bytes(); - uint32_t hash = len; - if (len > 0) { - uint32_t align_p = 3 & reinterpret_cast(p); - const uint32_t* p32 = reinterpret_cast(p - align_p); - uint32_t ofs = align_p * 8; - uint32_t v; - while (len > 4) { - v = *p32++; - if (ofs) { v = __funnelshift_r(v, *p32, ofs); } - hash = __funnelshift_l(hash, hash, 5) + v; - len -= 4; - } - v = *p32; - if (ofs) { v = __funnelshift_r(v, (align_p + len > 4) ? p32[1] : 0, ofs); } - v &= ((2 << (len * 8 - 1)) - 1); - hash = __funnelshift_l(hash, hash, 5) + v; - } - return uint32_hash16(hash); -} - -/** - * @brief Fetch a page fragment and its dictionary entries in row-ascending order - * - * @param[in,out] s dictionary state - * @param[in,out] dict_data fragment dictionary data for the current column (zeroed out after - *fetching) - * @param[in] frag_start_row row position of current fragment - * @param[in] t thread id - */ -__device__ void FetchDictionaryFragment(dict_state_s* s, - uint32_t* dict_data, - uint32_t frag_start_row, - uint32_t t) -{ - if (t == 0) s->frag = *s->cur_fragment; - __syncthreads(); - // Store the row values in shared mem and set the corresponding dict_data to zero (end-of-list) - // It's easiest to do this here since we're only dealing with values all within a 5K-row window - for (uint32_t i = t; i < s->frag.num_dict_vals; i += 1024) { - uint32_t r = dict_data[frag_start_row + i] - frag_start_row; - s->frag_dict[i] = r; - } - __syncthreads(); - for (uint32_t i = t; i < s->frag.num_dict_vals; i += 1024) { - uint32_t r = s->frag_dict[i]; - dict_data[frag_start_row + r] = 0; - } - __syncthreads(); -} - -/// Generate dictionary indices in ascending row order -template -__device__ void GenerateDictionaryIndices(dict_state_s* s, uint32_t t) -{ - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage temp_storage; - uint32_t* dict_index = s->col.dict_index; - uint32_t* dict_data = s->col.dict_data + s->ck.start_row; - uint32_t num_dict_entries = 0; - - for (uint32_t i = 0; i < s->row_cnt; i += 1024) { - uint32_t row = s->ck.start_row + i + t; - uint32_t is_valid = - (i + t < s->row_cnt && row < s->col.num_rows) ? s->col.leaf_column->is_valid(row) : 0; - uint32_t dict_idx = (is_valid) ? 
dict_index[row] : 0; - uint32_t is_unique = - (is_valid && - dict_idx == - row); // Any value that doesn't have bit31 set should have dict_idx=row at this point - uint32_t block_num_dict_entries; - uint32_t pos; - block_scan(temp_storage).ExclusiveSum(is_unique, pos, block_num_dict_entries); - pos += num_dict_entries; - num_dict_entries += block_num_dict_entries; - if (is_valid && is_unique) { - dict_data[pos] = row; - dict_index[row] = pos; - } - __syncthreads(); - if (is_valid && !is_unique) { - // NOTE: Should have at most 3 iterations (once for early duplicate elimination, once for - // final dictionary duplicate elimination and once for re-ordering) (If something went wrong - // building the dictionary, it will likely hang or crash right here) - do { - dict_idx = dict_index[dict_idx & 0x7fffffff]; - } while (dict_idx > 0x7fffffff); - dict_index[row] = dict_idx; - } - } -} - -// blockDim(1024, 1, 1) -template -__global__ void __launch_bounds__(block_size, 1) - gpuBuildChunkDictionaries(device_span chunks, uint32_t* dev_scratch) -{ - __shared__ __align__(8) dict_state_s state_g; - using block_reduce = cub::BlockReduce; - __shared__ typename block_reduce::TempStorage temp_storage; - - dict_state_s* const s = &state_g; - uint32_t t = threadIdx.x; - uint32_t dtype, dtype_len, dtype_len_in; - - if (t == 0) s->ck = chunks[blockIdx.x]; - __syncthreads(); - - if (!s->ck.has_dictionary) { return; } - - if (t == 0) s->col = *s->ck.col_desc; - __syncthreads(); - - if (!t) { - s->hashmap = dev_scratch + s->ck.dictionary_id * (size_t)(1 << kDictHashBits); - s->row_cnt = 0; - s->cur_fragment = s->ck.fragments; - s->total_dict_entries = 0; - s->dictionary_size = 0; - s->ck.num_dict_fragments = 0; - } - dtype = s->col.physical_type; - dtype_len = (dtype == INT96) ? 12 : (dtype == INT64 || dtype == DOUBLE) ? 8 : 4; - if (dtype == INT32) { - dtype_len_in = GetDtypeLogicalLen(s->col.leaf_column); - } else if (dtype == INT96) { - dtype_len_in = 8; - } else { - dtype_len_in = dtype_len; - } - __syncthreads(); - while (s->row_cnt < s->ck.num_rows) { - uint32_t frag_start_row = s->ck.start_row + s->row_cnt, num_dict_entries, frag_dict_size; - FetchDictionaryFragment(s, s->col.dict_data, frag_start_row, t); - __syncthreads(); - num_dict_entries = s->frag.num_dict_vals; - if (!t) { - s->num_dict_entries = 0; - s->frag_dict_size = 0; - } - for (uint32_t i = 0; i < num_dict_entries; i += 1024) { - bool is_valid = (i + t < num_dict_entries); - uint32_t len = 0; - uint32_t is_dupe = 0; - uint32_t row, hash, next, *next_addr; - uint32_t new_dict_entries; - - if (is_valid) { - row = frag_start_row + s->frag_dict[i + t]; - len = dtype_len; - if (dtype == BYTE_ARRAY) { - auto str1 = s->col.leaf_column->element(row); - len += str1.size_bytes(); - hash = hash_string(str1); - // Walk the list of rows with the same hash - next_addr = &s->hashmap[hash]; - while ((next = atomicCAS(next_addr, 0, row + 1)) != 0) { - auto const current = next - 1; - auto str2 = s->col.leaf_column->element(current); - if (str1 == str2) { - is_dupe = 1; - break; - } - next_addr = &s->col.dict_data[next - 1]; - } - } else { - uint64_t val; - - if (dtype_len_in == 8) { - val = s->col.leaf_column->element(row); - hash = uint64_hash16(val); - } else { - val = (dtype_len_in == 4) ? s->col.leaf_column->element(row) - : (dtype_len_in == 2) ? 
s->col.leaf_column->element(row) - : s->col.leaf_column->element(row); - hash = uint32_hash16(val); - } - // Walk the list of rows with the same hash - next_addr = &s->hashmap[hash]; - while ((next = atomicCAS(next_addr, 0, row + 1)) != 0) { - auto const current = next - 1; - uint64_t val2 = (dtype_len_in == 8) ? s->col.leaf_column->element(current) - : (dtype_len_in == 4) ? s->col.leaf_column->element(current) - : (dtype_len_in == 2) ? s->col.leaf_column->element(current) - : s->col.leaf_column->element(current); - if (val2 == val) { - is_dupe = 1; - break; - } - next_addr = &s->col.dict_data[next - 1]; - } - } - } - // Count the non-duplicate entries - frag_dict_size = block_reduce(temp_storage).Sum((is_valid && !is_dupe) ? len : 0); - new_dict_entries = __syncthreads_count(is_valid && !is_dupe); - if (t == 0) { - s->frag_dict_size += frag_dict_size; - s->num_dict_entries += new_dict_entries; - } - if (is_valid) { - if (!is_dupe) { - s->col.dict_index[row] = row; - } else { - s->col.dict_index[row] = (next - 1) | (1u << 31); - } - } - __syncthreads(); - // At this point, the dictionary order is non-deterministic, and we want insertion order - // Make sure that the non-duplicate entry corresponds to the lower row number - // (The entry in dict_data (next-1) used for duplicate elimination does not need - // to be the lowest row number) - bool reorder_check = (is_valid && is_dupe && next - 1 > row); - if (reorder_check) { - next = s->col.dict_index[next - 1]; - while (next & (1u << 31)) { - next = s->col.dict_index[next & 0x7fffffff]; - } - } - if (__syncthreads_or(reorder_check)) { - if (reorder_check) { atomicMin(&s->col.dict_index[next], row); } - __syncthreads(); - if (reorder_check && s->col.dict_index[next] == row) { - s->col.dict_index[next] = row | (1u << 31); - s->col.dict_index[row] = row; - } - __syncthreads(); - } - } - __syncthreads(); - num_dict_entries = s->num_dict_entries; - frag_dict_size = s->frag_dict_size; - if (s->total_dict_entries + num_dict_entries > 65536 || - (s->dictionary_size != 0 && s->dictionary_size + frag_dict_size > 512 * 1024)) { - break; - } - __syncthreads(); - if (!t) { - if (num_dict_entries != s->frag.num_dict_vals) { - s->cur_fragment->num_dict_vals = num_dict_entries; - } - if (frag_dict_size != s->frag.dict_data_size) { s->frag.dict_data_size = frag_dict_size; } - s->total_dict_entries += num_dict_entries; - s->dictionary_size += frag_dict_size; - s->row_cnt += s->frag.num_rows; - s->cur_fragment++; - s->ck.num_dict_fragments++; - } - __syncthreads(); - } - __syncthreads(); - GenerateDictionaryIndices(s, t); - if (!t) { - chunks[blockIdx.x].num_dict_fragments = s->ck.num_dict_fragments; - chunks[blockIdx.x].dictionary_size = s->dictionary_size; - chunks[blockIdx.x].total_dict_entries = s->total_dict_entries; - } -} - -/** - * @brief Launches kernel for building chunk dictionaries - * - * @param[in,out] chunks Column chunks - * @param[in] dev_scratch Device scratch data (kDictScratchSize per dictionary) - * @param[in] stream CUDA stream to use, default 0 - */ -void BuildChunkDictionaries(device_span chunks, - uint32_t* dev_scratch, - rmm::cuda_stream_view stream) -{ - auto num_chunks = chunks.size(); - gpuBuildChunkDictionaries<1024><<>>(chunks, dev_scratch); -} - -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 3c62dcf7eea..70b2e27f75d 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -48,14 +48,7 
@@ constexpr uint32_t rle_buffer_size = (1 << 9); struct frag_init_state_s { parquet_column_device_view col; PageFragment frag; - uint32_t total_dupes; size_type start_value_idx; - volatile uint32_t scratch_red[32]; - uint32_t dict[max_page_fragment_size]; - union { - uint16_t u16[1 << (init_hash_bits)]; - uint32_t u32[1 << (init_hash_bits - 1)]; - } map; }; struct page_enc_state_s { @@ -68,6 +61,7 @@ struct page_enc_state_s { uint32_t rle_lit_count; uint32_t rle_rpt_count; uint32_t page_start_val; + uint32_t chunk_start_val; volatile uint32_t rpt_map[4]; volatile uint32_t scratch_red[32]; EncPage page; @@ -124,31 +118,22 @@ __global__ void __launch_bounds__(block_size) __shared__ __align__(16) frag_init_state_s state_g; using block_reduce = cub::BlockReduce; - using block_scan = cub::BlockScan; - __shared__ union { - typename block_reduce::TempStorage reduce_storage; - typename block_scan::TempStorage scan_storage; - } temp_storage; + __shared__ typename block_reduce::TempStorage reduce_storage; frag_init_state_s* const s = &state_g; uint32_t t = threadIdx.x; - uint32_t start_row, dtype_len, dtype_len_in, dtype; + uint32_t start_row, dtype_len, dtype; if (t == 0) s->col = col_desc[blockIdx.x]; - for (uint32_t i = 0; i < sizeof(s->map) / sizeof(uint32_t); i += block_size) { - if (i + t < sizeof(s->map) / sizeof(uint32_t)) s->map.u32[i + t] = 0; - } __syncthreads(); start_row = blockIdx.y * fragment_size; if (!t) { // frag.num_rows = fragment_size except for the last page fragment which can be smaller. // num_rows is fixed but fragment size could be larger if the data is strings or nested. s->frag.num_rows = min(fragment_size, max_num_rows - min(start_row, max_num_rows)); - s->frag.non_nulls = 0; s->frag.num_dict_vals = 0; s->frag.fragment_data_size = 0; s->frag.dict_data_size = 0; - s->total_dupes = 0; // To use num_vals instead of num_rows, we need to calculate num_vals on the fly. // For list>, values between i and i+50 can be calculated by @@ -195,16 +180,6 @@ __global__ void __launch_bounds__(block_size) : (dtype == INT64 || dtype == DOUBLE) ? 8 : (dtype == BOOLEAN) ? 1 : 4; - if (dtype == INT32) { - dtype_len_in = GetDtypeLogicalLen(s->col.leaf_column); - } else if (dtype == INT96) { - // cudf doesn't support INT96 internally and uses INT64, so treat INT96 as an INT64 for - // computing dictionary hash values and reading the data, but we do treat it as 12 bytes for - // dtype_len, which determines how much memory we need to allocate for the fragment. - dtype_len_in = 8; - } else { - dtype_len_in = dtype_len; - } __syncthreads(); size_type nvals = s->frag.num_leaf_values; @@ -215,167 +190,22 @@ __global__ void __launch_bounds__(block_size) uint32_t is_valid = (i + t < nvals && val_idx < s->col.leaf_column->size()) ? s->col.leaf_column->is_valid(val_idx) : 0; - uint32_t len, nz_pos, hash; + uint32_t len; if (is_valid) { len = dtype_len; if (dtype != BOOLEAN) { if (dtype == BYTE_ARRAY) { auto str = s->col.leaf_column->element(val_idx); len += str.size_bytes(); - hash = hash_string(str); - } else if (dtype_len_in == 8) { - hash = uint64_init_hash(s->col.leaf_column->element(val_idx)); - } else { - hash = - uint32_init_hash((dtype_len_in == 4) ? s->col.leaf_column->element(val_idx) - : (dtype_len_in == 2) ? 
s->col.leaf_column->element(val_idx) - : s->col.leaf_column->element(val_idx)); } } } else { len = 0; } - uint32_t non_nulls; - block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, nz_pos, non_nulls); - nz_pos += s->frag.non_nulls; - __syncthreads(); - len = block_reduce(temp_storage.reduce_storage).Sum(len); - if (!t) { - s->frag.non_nulls += non_nulls; - s->frag.fragment_data_size += len; - } - __syncthreads(); - if (is_valid && dtype != BOOLEAN) { - uint32_t* dict_index = s->col.dict_index; - if (dict_index) { - atomicAdd(&s->map.u32[hash >> 1], (hash & 1) ? 1 << 16 : 1); - dict_index[start_value_idx + nz_pos] = - ((i + t) << init_hash_bits) | - hash; // Store the hash along with the index, so we don't have to recompute it - } - } - __syncthreads(); - } - __syncthreads(); - // Reorder the 16-bit local indices according to the hash values - if (s->col.dict_index) { - static_assert((init_hash_bits == 12), "Hardcoded for init_hash_bits=12"); - // Cumulative sum of hash map counts - uint32_t count01 = s->map.u32[t * 4 + 0]; - uint32_t count23 = s->map.u32[t * 4 + 1]; - uint32_t count45 = s->map.u32[t * 4 + 2]; - uint32_t count67 = s->map.u32[t * 4 + 3]; - uint32_t sum01 = count01 + (count01 << 16); - uint32_t sum23 = count23 + (count23 << 16); - uint32_t sum45 = count45 + (count45 << 16); - uint32_t sum67 = count67 + (count67 << 16); - sum23 += (sum01 >> 16) * 0x10001; - sum45 += (sum23 >> 16) * 0x10001; - sum67 += (sum45 >> 16) * 0x10001; - uint32_t sum_w = sum67 >> 16; - block_scan(temp_storage.scan_storage).InclusiveSum(sum_w, sum_w); - sum_w = (sum_w - (sum67 >> 16)) * 0x10001; - s->map.u32[t * 4 + 0] = sum_w + sum01 - count01; - s->map.u32[t * 4 + 1] = sum_w + sum23 - count23; - s->map.u32[t * 4 + 2] = sum_w + sum45 - count45; - s->map.u32[t * 4 + 3] = sum_w + sum67 - count67; - } - __syncthreads(); - // Put the indices back in hash order - if (s->col.dict_index) { - uint32_t* dict_index = s->col.dict_index + start_row; - uint32_t nnz = s->frag.non_nulls; - for (uint32_t i = 0; i < nnz; i += block_size) { - uint32_t pos = 0, hash = 0, pos_old, pos_new, sh, colliding_row, val = 0; - bool collision; - if (i + t < nnz) { - val = dict_index[i + t]; - hash = val & ((1 << init_hash_bits) - 1); - sh = (hash & 1) ? 
16 : 0; - pos_old = s->map.u16[hash]; - } - // The isolation of the atomicAdd, along with pos_old/pos_new is to guarantee deterministic - // behavior for the first row in the hash map that will be used for early duplicate detection - __syncthreads(); - if (i + t < nnz) { - pos = (atomicAdd(&s->map.u32[hash >> 1], 1 << sh) >> sh) & 0xffff; - s->dict[pos] = val; - } - __syncthreads(); - collision = false; - if (i + t < nnz) { - pos_new = s->map.u16[hash]; - collision = (pos != pos_old && pos_new > pos_old + 1); - if (collision) { colliding_row = s->dict[pos_old]; } - } - __syncthreads(); - if (collision) { atomicMin(&s->dict[pos_old], val); } - __syncthreads(); - // Resolve collision - if (collision && val == s->dict[pos_old]) { s->dict[pos] = colliding_row; } - } + len = block_reduce(reduce_storage).Sum(len); + if (!t) { s->frag.fragment_data_size += len; } __syncthreads(); - // Now that the values are ordered by hash, compare every entry with the first entry in the hash - // map, the position of the first entry can be inferred from the hash map counts - uint32_t dupe_data_size = 0; - for (uint32_t i = 0; i < nnz; i += block_size) { - uint32_t ck_row = 0, ck_row_ref = 0, is_dupe = 0; - if (i + t < nnz) { - uint32_t dict_val = s->dict[i + t]; - uint32_t hash = dict_val & ((1 << init_hash_bits) - 1); - ck_row = start_row + (dict_val >> init_hash_bits); - ck_row_ref = start_row + (s->dict[(hash > 0) ? s->map.u16[hash - 1] : 0] >> init_hash_bits); - if (ck_row_ref != ck_row) { - if (dtype == BYTE_ARRAY) { - auto str1 = s->col.leaf_column->element(ck_row); - auto str2 = s->col.leaf_column->element(ck_row_ref); - is_dupe = (str1 == str2); - dupe_data_size += (is_dupe) ? 4 + str1.size_bytes() : 0; - } else { - if (dtype_len_in == 8) { - auto v1 = s->col.leaf_column->element(ck_row); - auto v2 = s->col.leaf_column->element(ck_row_ref); - is_dupe = (v1 == v2); - dupe_data_size += (is_dupe) ? 8 : 0; - } else { - uint32_t v1, v2; - if (dtype_len_in == 4) { - v1 = s->col.leaf_column->element(ck_row); - v2 = s->col.leaf_column->element(ck_row_ref); - } else if (dtype_len_in == 2) { - v1 = s->col.leaf_column->element(ck_row); - v2 = s->col.leaf_column->element(ck_row_ref); - } else { - v1 = s->col.leaf_column->element(ck_row); - v2 = s->col.leaf_column->element(ck_row_ref); - } - is_dupe = (v1 == v2); - dupe_data_size += (is_dupe) ? 
4 : 0; - } - } - } - } - uint32_t dupes_in_block; - uint32_t dupes_before; - block_scan(temp_storage.scan_storage).InclusiveSum(is_dupe, dupes_before, dupes_in_block); - dupes_before += s->total_dupes; - __syncthreads(); - if (t == 0) { s->total_dupes += dupes_in_block; } - if (i + t < nnz) { - if (!is_dupe) { - s->col.dict_data[start_row + i + t - dupes_before] = ck_row; - } else { - s->col.dict_index[ck_row] = ck_row_ref | (1u << 31); - } - } - } - __syncthreads(); - dupe_data_size = block_reduce(temp_storage.reduce_storage).Sum(dupe_data_size); - if (!t) { - s->frag.dict_data_size = s->frag.fragment_data_size - dupe_data_size; - s->frag.num_dict_vals = s->frag.non_nulls - s->total_dupes; - } } __syncthreads(); if (t == 0) frag[blockIdx.x][blockIdx.y] = s->frag; @@ -449,22 +279,21 @@ __global__ void __launch_bounds__(128) pagestats_g.start_chunk = ck_g.first_fragment; pagestats_g.num_chunks = 0; } - if (ck_g.has_dictionary) { + if (ck_g.use_dictionary) { if (!t) { page_g.page_data = ck_g.uncompressed_bfr + page_offset; page_g.compressed_data = ck_g.compressed_bfr + comp_page_offset; page_g.num_fragments = 0; page_g.page_type = PageType::DICTIONARY_PAGE; - page_g.dict_bits_plus1 = 0; page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; page_g.hdr_size = 0; page_g.max_hdr_size = 32; - page_g.max_data_size = ck_g.dictionary_size; + page_g.max_data_size = ck_g.uniq_data_size; page_g.start_row = cur_row; - page_g.num_rows = ck_g.total_dict_entries; - page_g.num_leaf_values = ck_g.total_dict_entries; - page_g.num_values = ck_g.total_dict_entries; + page_g.num_rows = ck_g.num_dict_entries; + page_g.num_leaf_values = ck_g.num_dict_entries; + page_g.num_values = ck_g.num_dict_entries; // TODO: shouldn't matter for dict page page_offset += page_g.max_hdr_size + page_g.max_data_size; comp_page_offset += page_g.max_hdr_size + GetMaxCompressedBfrSize(page_g.max_data_size); } @@ -483,7 +312,7 @@ __global__ void __launch_bounds__(128) // This doesn't actually deal with data. It's agnostic. It only cares about number of rows and // page size. do { - uint32_t fragment_data_size, max_page_size, minmax_len = 0; + uint32_t minmax_len = 0; __syncwarp(); if (num_rows < ck_g.num_rows) { if (t == 0) { frag_g = ck_g.fragments[fragments_in_chunk]; } @@ -496,50 +325,27 @@ __global__ void __launch_bounds__(128) frag_g.num_rows = 0; } __syncwarp(); - if (ck_g.has_dictionary && fragments_in_chunk < ck_g.num_dict_fragments) { - fragment_data_size = - frag_g.num_leaf_values * 2; // Assume worst-case of 2-bytes per dictionary index - } else { - fragment_data_size = frag_g.fragment_data_size; - } + uint32_t fragment_data_size = + (ck_g.use_dictionary) + ? frag_g.num_leaf_values * 2 // Assume worst-case of 2-bytes per dictionary index + : frag_g.fragment_data_size; // TODO (dm): this convoluted logic to limit page size needs refactoring - max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 - : (values_in_page * 3 >= ck_g.num_values) ? 384 * 1024 - : 512 * 1024; + uint32_t max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 + : (values_in_page * 3 >= ck_g.num_values) ? 
384 * 1024 + : 512 * 1024; if (num_rows >= ck_g.num_rows || - (values_in_page > 0 && - (page_size + fragment_data_size > max_page_size || - (ck_g.has_dictionary && fragments_in_chunk == ck_g.num_dict_fragments)))) { - uint32_t dict_bits_plus1; - - if (ck_g.has_dictionary && page_start < ck_g.num_dict_fragments) { - uint32_t dict_bits; - if (num_dict_entries <= 2) { - dict_bits = 1; - } else if (num_dict_entries <= 4) { - dict_bits = 2; - } else if (num_dict_entries <= 16) { - dict_bits = 4; - } else if (num_dict_entries <= 256) { - dict_bits = 8; - } else if (num_dict_entries <= 4096) { - dict_bits = 12; - } else { - dict_bits = 16; - } - page_size = 1 + 5 + ((values_in_page * dict_bits + 7) >> 3) + (values_in_page >> 8); - dict_bits_plus1 = dict_bits + 1; - } else { - dict_bits_plus1 = 0; + (values_in_page > 0 && (page_size + fragment_data_size > max_page_size))) { + if (ck_g.use_dictionary) { + page_size = + 1 + 5 + ((values_in_page * ck_g.dict_rle_bits + 7) >> 3) + (values_in_page >> 8); } if (!t) { - page_g.num_fragments = fragments_in_chunk - page_start; - page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; - page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; - page_g.page_type = PageType::DATA_PAGE; - page_g.dict_bits_plus1 = dict_bits_plus1; - page_g.hdr_size = 0; - page_g.max_hdr_size = 32; // Max size excluding statistics + page_g.num_fragments = fragments_in_chunk - page_start; + page_g.chunk = &chunks[blockIdx.y][blockIdx.x]; + page_g.chunk_id = blockIdx.y * num_columns + blockIdx.x; + page_g.page_type = PageType::DATA_PAGE; + page_g.hdr_size = 0; + page_g.max_hdr_size = 32; // Max size excluding statistics if (ck_g.stats) { uint32_t stats_hdr_len = 16; if (col_g.stats_dtype == dtype_string) { @@ -611,8 +417,8 @@ __global__ void __launch_bounds__(128) ck_g.num_pages = num_pages; ck_g.bfr_size = page_offset; ck_g.compressed_size = comp_page_offset; - pagestats_g.start_chunk = ck_g.first_page + ck_g.has_dictionary; // Exclude dictionary - pagestats_g.num_chunks = num_pages - ck_g.has_dictionary; + pagestats_g.start_chunk = ck_g.first_page + ck_g.use_dictionary; // Exclude dictionary + pagestats_g.num_chunks = num_pages - ck_g.use_dictionary; } } __syncthreads(); @@ -1069,7 +875,10 @@ __global__ void __launch_bounds__(128, 8) } else { dtype_len_in = dtype_len_out; } - dict_bits = (dtype == BOOLEAN) ? 1 : (s->page.dict_bits_plus1 - 1); + dict_bits = (dtype == BOOLEAN) ? 1 + : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) + ? s->ck.dict_rle_bits + : -1; if (t == 0) { uint8_t* dst = s->cur; s->rle_run = 0; @@ -1080,37 +889,56 @@ __global__ void __launch_bounds__(128, 8) dst[0] = dict_bits; s->rle_out = dst + 1; } - s->page_start_val = s->page.start_row; - if (s->col.parent_column != nullptr) { + s->page_start_val = s->page.start_row; // Dictionary page's start row is chunk's start row + auto chunk_start_val = s->ck.start_row; + if (s->col.parent_column != nullptr) { // TODO: remove this check. parent is now never nullptr auto col = *(s->col.parent_column); auto current_page_start_val = s->page_start_val; + // TODO: We do this so much. 
Add a global function that converts row idx to val idx while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { if (col.type().id() == type_id::STRUCT) { current_page_start_val += col.offset(); + chunk_start_val += col.offset(); col = col.child(0); } else { - current_page_start_val = col.child(lists_column_view::offsets_column_index) - .element(current_page_start_val + col.offset()); - col = col.child(lists_column_view::child_column_index); + auto offset_col = col.child(lists_column_view::offsets_column_index); + current_page_start_val = + offset_col.element(current_page_start_val + col.offset()); + chunk_start_val = offset_col.element(chunk_start_val + col.offset()); + col = col.child(lists_column_view::child_column_index); } } - s->page_start_val = current_page_start_val; + s->page_start_val = current_page_start_val; + s->chunk_start_val = chunk_start_val; } } __syncthreads(); for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { - uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, 128); - uint32_t val_idx = s->page_start_val + cur_val_idx + t; - uint32_t is_valid, len, pos; + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, 128); + uint32_t len, pos; + + auto [is_valid, val_idx] = [&]() { + uint32_t val_idx; + uint32_t is_valid; + + size_type val_idx_in_block = cur_val_idx + t; + if (s->page.page_type == PageType::DICTIONARY_PAGE) { + val_idx = val_idx_in_block; + is_valid = (val_idx < s->page.num_leaf_values); + if (is_valid) { val_idx = s->ck.dict_data[val_idx]; } + } else { + size_type val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) + : 0; + val_idx = + (s->ck.use_dictionary) ? val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + } + return std::make_tuple(is_valid, val_idx); + }(); - if (s->page.page_type == PageType::DICTIONARY_PAGE) { - is_valid = (cur_val_idx + t < s->page.num_leaf_values); - val_idx = (is_valid) ? s->col.dict_data[val_idx] : val_idx; - } else { - is_valid = (val_idx < s->col.leaf_column->size() && cur_val_idx + t < s->page.num_leaf_values) - ? s->col.leaf_column->is_valid(val_idx) - : 0; - } cur_val_idx += nvals; if (dict_bits >= 0) { // Dictionary encoding @@ -1124,7 +952,7 @@ __global__ void __launch_bounds__(128, 8) if (dtype == BOOLEAN) { v = s->col.leaf_column->element(val_idx); } else { - v = s->col.dict_index[val_idx]; + v = s->ck.dict_index[val_idx]; } s->vals[(rle_numvals + pos) & (rle_buffer_size - 1)] = v; } @@ -1531,13 +1359,12 @@ __global__ void __launch_bounds__(128) // data pages (actual encoding is identical). Encoding encoding; if (enable_bool_rle) { - encoding = (col_g.physical_type != BOOLEAN) - ? (page_type == PageType::DICTIONARY_PAGE || page_g.dict_bits_plus1 != 0) - ? Encoding::PLAIN_DICTIONARY - : Encoding::PLAIN - : Encoding::RLE; + encoding = (col_g.physical_type == BOOLEAN) ? Encoding::RLE + : (page_type == PageType::DICTIONARY_PAGE || page_g.chunk->use_dictionary) + ? Encoding::PLAIN_DICTIONARY + : Encoding::PLAIN; } else { - encoding = (page_type == PageType::DICTIONARY_PAGE || page_g.dict_bits_plus1 != 0) + encoding = (page_type == PageType::DICTIONARY_PAGE || page_g.chunk->use_dictionary) ? 
Encoding::PLAIN_DICTIONARY : Encoding::PLAIN; } @@ -1562,7 +1389,7 @@ __global__ void __launch_bounds__(128) } else { // DictionaryPageHeader encoder.field_struct_begin(7); - encoder.field_int32(1, ck_g.total_dict_entries); // number of values in dictionary + encoder.field_int32(1, ck_g.num_dict_entries); // number of values in dictionary encoder.field_int32(2, encoding); encoder.field_struct_end(7); } @@ -1613,12 +1440,12 @@ __global__ void __launch_bounds__(1024) memcpy_block<1024, true>(dst, src, data_len, t); dst += data_len; __syncthreads(); - if (!t && page == 0 && ck_g.has_dictionary) { ck_g.dictionary_size = hdr_len + data_len; } + if (!t && page == 0 && ck_g.use_dictionary) { ck_g.dictionary_size = hdr_len + data_len; } } if (t == 0) { chunks[blockIdx.x].bfr_size = uncompressed_size; chunks[blockIdx.x].compressed_size = (dst - dst_base); - if (ck_g.has_dictionary) { chunks[blockIdx.x].dictionary_size = ck_g.dictionary_size; } + if (ck_g.use_dictionary) { chunks[blockIdx.x].dictionary_size = ck_g.dictionary_size; } } } @@ -1966,9 +1793,9 @@ dremel_data get_dremel_data(column_view h_col, // Scan to get distance by which each offset value is shifted due to the insertion of empties auto scan_it = cudf::detail::make_counting_transform_iterator( - column_offsets[level], [off = lcv.offsets().data()] __device__(auto i) -> int { - return off[i] == off[i + 1]; - }); + column_offsets[level], + [off = lcv.offsets().data(), size = lcv.offsets().size()] __device__( + auto i) -> int { return (i + 1 < size) && (off[i] == off[i + 1]); }); rmm::device_uvector scan_out(offset_size_at_level, stream); thrust::exclusive_scan( rmm::exec_policy(stream), scan_it, scan_it + offset_size_at_level, scan_out.begin()); @@ -2053,9 +1880,9 @@ dremel_data get_dremel_data(column_view h_col, // Scan to get distance by which each offset value is shifted due to the insertion of dremel // level value fof an empty list auto scan_it = cudf::detail::make_counting_transform_iterator( - column_offsets[level], [off = lcv.offsets().data()] __device__(auto i) -> int { - return off[i] == off[i + 1]; - }); + column_offsets[level], + [off = lcv.offsets().data(), size = lcv.offsets().size()] __device__( + auto i) -> int { return (i + 1 < size) && (off[i] == off[i + 1]); }); rmm::device_uvector scan_out(offset_size_at_level, stream); thrust::exclusive_scan( rmm::exec_policy(stream), scan_it, scan_it + offset_size_at_level, scan_out.begin()); diff --git a/cpp/src/io/parquet/parquet.cpp b/cpp/src/io/parquet/parquet.cpp index 6c658788fa1..c8c54e9933f 100644 --- a/cpp/src/io/parquet/parquet.cpp +++ b/cpp/src/io/parquet/parquet.cpp @@ -347,6 +347,7 @@ int CompactProtocolReader::WalkSchema( ++idx; if (e->num_children > 0) { for (int i = 0; i < e->num_children; i++) { + e->children_idx.push_back(idx); int idx_old = idx; idx = WalkSchema(md, idx, parent_idx, max_def_level, max_rep_level); if (idx <= idx_old) break; // Error diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 2232017409d..4390d1c788f 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -165,6 +165,7 @@ struct SchemaElement { int max_definition_level = 0; int max_repetition_level = 0; int parent_idx = 0; + std::vector children_idx; bool operator==(SchemaElement const& other) const { diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 975d2545cd1..cdd7c6b6674 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -28,6 +28,8 @@ #include 
#include +#include + #include #include #include @@ -42,6 +44,10 @@ namespace parquet { using cudf::io::detail::string_index_pair; +// Total number of unsigned 16 bit values +constexpr size_type MAX_DICT_SIZE = + std::numeric_limits::max() - std::numeric_limits::min() + 1; + /** * @brief Struct representing an input column in the file. */ @@ -56,6 +62,11 @@ struct input_column_info { namespace gpu { +auto constexpr KEY_SENTINEL = size_type{-1}; +auto constexpr VALUE_SENTINEL = size_type{-1}; +using map_type = cuco::static_map; +using slot_type = map_type::pair_atomic_type; + /** * @brief Enums for the flags in the page header */ @@ -222,8 +233,6 @@ struct ColumnChunkDesc { * @brief Struct describing an encoder column */ struct parquet_column_device_view : stats_column_desc { - uint32_t* dict_index; //!< Dictionary index [row] - uint32_t* dict_data; //!< Dictionary data (unique row indices) uint8_t physical_type; //!< physical data type uint8_t converted_type; //!< logical data type uint8_t level_bits; //!< bits to encode max definition (lower nibble) & repetition (upper nibble) @@ -236,9 +245,9 @@ struct parquet_column_device_view : stats_column_desc { size_type const* level_offsets; //!< Offset array for per-row pre-calculated rep/def level values uint8_t const* rep_values; //!< Pre-calculated repetition level values uint8_t const* def_values; //!< Pre-calculated definition level values - uint8_t* nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is - //!< nullability of parent_column. May be different from col.nullable() in - //!< case of chunked writing. + uint8_t const* nullability; //!< Array of nullability of each nesting level. e.g. nullable[0] is + //!< nullability of parent_column. May be different from + //!< col.nullable() in case of chunked writing. }; constexpr int max_page_fragment_size = 5000; //!< Max number of rows in a page fragment @@ -253,7 +262,6 @@ struct PageFragment { uint32_t start_value_idx; uint32_t num_leaf_values; //!< Number of leaf values in fragment. Does not include nulls at //!< non-leaf level - uint32_t non_nulls; //!< Number of non-null values uint16_t num_rows; //!< Number of rows in fragment uint16_t num_dict_vals; //!< Number of unique dictionary entries }; @@ -292,26 +300,33 @@ struct EncPage; */ struct EncColumnChunk { parquet_column_device_view const* col_desc; //!< Column description - PageFragment* fragments; //!< First fragment in chunk - uint8_t* uncompressed_bfr; //!< Uncompressed page data - uint8_t* compressed_bfr; //!< Compressed page data - statistics_chunk const* stats; //!< Fragment statistics - uint32_t bfr_size; //!< Uncompressed buffer size - uint32_t compressed_size; //!< Compressed buffer size - uint32_t start_row; //!< First row of chunk - uint32_t num_rows; //!< Number of rows in chunk - uint32_t num_values; //!< Number of values in chunk. Different from num_rows for nested types + size_type col_desc_id; + PageFragment* fragments; //!< First fragment in chunk + uint8_t* uncompressed_bfr; //!< Uncompressed page data + uint8_t* compressed_bfr; //!< Compressed page data + statistics_chunk const* stats; //!< Fragment statistics + uint32_t bfr_size; //!< Uncompressed buffer size + uint32_t compressed_size; //!< Compressed buffer size + uint32_t start_row; //!< First row of chunk + uint32_t num_rows; //!< Number of rows in chunk + size_type num_values; //!< Number of values in chunk. 
Different from num_rows for nested types uint32_t first_fragment; //!< First fragment of chunk EncPage* pages; //!< Ptr to pages that belong to this chunk uint32_t first_page; //!< First page of chunk uint32_t num_pages; //!< Number of pages in chunk - uint32_t dictionary_id; //!< Dictionary id for this chunk uint8_t is_compressed; //!< Nonzero if the chunk uses compression - uint8_t has_dictionary; //!< Nonzero if the chunk uses dictionary encoding - uint16_t num_dict_fragments; //!< Number of fragments using dictionary - uint32_t dictionary_size; //!< Size of dictionary - uint32_t total_dict_entries; //!< Total number of entries in dictionary - uint32_t ck_stat_size; //!< Size of chunk-level statistics (included in 1st page header) + uint32_t dictionary_size; //!< Size of dictionary page including header + uint32_t ck_stat_size; //!< Size of chunk-level statistics (included in 1st page header) + slot_type* dict_map_slots; //!< Hash map storage for calculating dict encoding for this chunk + size_type dict_map_size; //!< Size of dict_map_slots + size_type num_dict_entries; //!< Total number of entries in dictionary + size_type + uniq_data_size; //!< Size of dictionary page (set of all unique values) if dict enc is used + size_type plain_data_size; //!< Size of data in this chunk if plain encoding is used + size_type* dict_data; //!< Dictionary data (unique row indices) + uint16_t* dict_index; //!< Index of value in dictionary page. column[dict_data[dict_index[row]]] + uint8_t dict_rle_bits; //!< Bit size for encoding dictionary indices + bool use_dictionary; //!< True if the chunk uses dictionary encoding }; /** @@ -322,7 +337,6 @@ struct EncPage { uint8_t* compressed_data; //!< Ptr to compressed page uint16_t num_fragments; //!< Number of fragments in page PageType page_type; //!< Page type - uint8_t dict_bits_plus1; //!< 0=plain, nonzero:bits to encoding dictionary indices + 1 EncColumnChunk* chunk; //!< Chunk that this page belongs to uint32_t chunk_id; //!< Index in chunk array uint32_t hdr_size; //!< Size of page header @@ -449,7 +463,7 @@ dremel_data get_dremel_data(column_view h_col, * @param[in] num_columns Number of columns * @param[in] fragment_size Number of rows per fragment * @param[in] num_rows Number of rows per column - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void InitPageFragments(cudf::detail::device_2dspan frag, device_span col_desc, @@ -463,13 +477,57 @@ void InitPageFragments(cudf::detail::device_2dspan frag, * @param[out] groups Statistics groups [num_columns x num_fragments] * @param[in] fragments Page fragments [num_columns x num_fragments] * @param[in] col_desc Column description [num_columns] - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void InitFragmentStatistics(cudf::detail::device_2dspan groups, cudf::detail::device_2dspan fragments, device_span col_desc, rmm::cuda_stream_view stream); +/** + * @brief Initialize per-chunk hash maps used for dictionary with sentinel values + * + * @param chunks Flat span of chunks to intialize hash maps for + * @param stream CUDA stream to use + */ +void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_stream_view stream); + +/** + * @brief Insert chunk values into their respective hash maps + * + * @param chunks Column chunks [rowgroup][column] + * @param num_rows Number of rows per column + * @param stream CUDA stream to use + */ +void populate_chunk_hash_maps(cudf::detail::device_2dspan chunks, + size_type 
num_rows, + rmm::cuda_stream_view stream); + +/** + * @brief Compact dictionary hash map entries into chunk.dict_data + * + * @param chunks Flat span of chunks to compact hash maps for + * @param stream CUDA stream to use + */ +void collect_map_entries(device_span chunks, rmm::cuda_stream_view stream); + +/** + * @brief Get the Dictionary Indices for each row + * + * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise + * stored in input column [row]. Stores these indices into chunk.dict_index. + * + * Since dict_data itself contains indices into the original cudf column, this means that + * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] + * + * @param chunks Column chunks [rowgroup][column] + * @param num_rows Number of rows per column + * @param stream CUDA stream to use + */ +void get_dictionary_indices(cudf::detail::device_2dspan chunks, + size_type num_rows, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for initializing encoder data pages * @@ -538,17 +596,6 @@ void GatherPages(device_span chunks, device_span pages, rmm::cuda_stream_view stream); -/** - * @brief Launches kernel for building chunk dictionaries - * - * @param[in] chunks Column chunks - * @param[in] dev_scratch Device scratch data (kDictScratchSize bytes per dictionary) - * @param[in] stream CUDA stream to use, default 0 - */ -void BuildChunkDictionaries(device_span chunks, - uint32_t* dev_scratch, - rmm::cuda_stream_view stream); - } // namespace gpu } // namespace parquet } // namespace io diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 9f9bdfd4755..caf11b66206 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -464,8 +464,9 @@ class aggregate_metadata { * * @param names List of column names to load, where index column name(s) will be added */ - void add_pandas_index_names(std::vector& names) const + std::vector get_pandas_index_names() const { + std::vector names; auto str = get_pandas_index(); if (str.length() != 0) { std::regex index_name_expr{R"(\"((?:\\.|[^\"])*)\")"}; @@ -480,6 +481,7 @@ class aggregate_metadata { str = sm.suffix(); } } + return names; } struct row_group_info { @@ -549,86 +551,14 @@ class aggregate_metadata { return selection; } - /** - * @brief Build input and output column structures based on schema input. Recursive. - * - * @param[in,out] schema_idx Schema index to build information for. This value gets - * incremented as the function recurses. - * @param[out] input_columns Input column information (source data in the file) - * @param[out] output_columns Output column structure (resulting cudf columns) - * @param[in,out] nesting A stack keeping track of child column indices so we can - * reproduce the linear list of output columns that correspond to an input column. - * @param[in] strings_to_categorical Type conversion parameter - * @param[in] timestamp_type_id Type conversion parameter - * @param[in] strict_decimal_types True if it is an error to load an unsupported decimal type - * - */ - void build_column_info(int& schema_idx, - std::vector& input_columns, - std::vector& output_columns, - std::deque& nesting, - bool strings_to_categorical, - type_id timestamp_type_id, - bool strict_decimal_types) const - { - int start_schema_idx = schema_idx; - auto const& schema = get_schema(schema_idx); - schema_idx++; - - // if I am a stub, continue on - if (schema.is_stub()) { - // is this legit? 
- CUDF_EXPECTS(schema.num_children == 1, "Unexpected number of children for stub"); - build_column_info(schema_idx, - input_columns, - output_columns, - nesting, - strings_to_categorical, - timestamp_type_id, - strict_decimal_types); - return; - } - - // if we're at the root, this is a new output column - nesting.push_back(static_cast(output_columns.size())); - auto const col_type = - to_type_id(schema, strings_to_categorical, timestamp_type_id, strict_decimal_types); - auto const dtype = col_type == type_id::DECIMAL32 || col_type == type_id::DECIMAL64 - ? data_type{col_type, numeric::scale_type{-schema.decimal_scale}} - : data_type{col_type}; - output_columns.emplace_back(dtype, schema.repetition_type == OPTIONAL ? true : false); - column_buffer& output_col = output_columns.back(); - output_col.name = schema.name; - - // build each child - for (int idx = 0; idx < schema.num_children; idx++) { - build_column_info(schema_idx, - input_columns, - output_col.children, - nesting, - strings_to_categorical, - timestamp_type_id, - strict_decimal_types); - } - - // if I have no children, we're at a leaf and I'm an input column (that is, one with actual - // data stored) so add me to the list. - if (schema.num_children == 0) { - input_columns.emplace_back(input_column_info{start_schema_idx, schema.name}); - input_column_info& input_col = input_columns.back(); - std::copy(nesting.begin(), nesting.end(), std::back_inserter(input_col.nesting)); - } - - nesting.pop_back(); - } - /** * @brief Filters and reduces down to a selection of columns * - * @param use_names List of column names to select + * @param use_names List of paths of column names to select * @param include_index Whether to always include the PANDAS index column(s) * @param strings_to_categorical Type conversion parameter * @param timestamp_type_id Type conversion parameter + * @param strict_decimal_types Type conversion parameter * * @return input column information, output column information, list of output column schema * indices @@ -639,9 +569,86 @@ class aggregate_metadata { type_id timestamp_type_id, bool strict_decimal_types) const { - auto const& pfm = per_file_metadata[0]; + auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) { + auto const& col_schema_idx = std::find_if( + schema_elem.children_idx.cbegin(), + schema_elem.children_idx.cend(), + [&](size_t col_schema_idx) { return get_schema(col_schema_idx).name == name; }); + + return (col_schema_idx != schema_elem.children_idx.end()) ? static_cast(*col_schema_idx) + : -1; + }; + + std::vector output_columns; + std::vector input_columns; + std::vector nesting; + + // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is + // valid if "struct1.child1" exists in this file's schema. If "struct1" exists but "child1" is + // not a child of "struct1" then the function will return false for "struct1" + std::function&)> build_column = + [&](column_name_info const* col_name_info, + int schema_idx, + std::vector& out_col_array) { + if (schema_idx < 0) { return false; } + auto const& schema_elem = get_schema(schema_idx); + + // if schema_elem is a stub then it does not exist in the column_name_info and column_buffer + // hierarchy. So continue on + if (schema_elem.is_stub()) { + // is this legit? + CUDF_EXPECTS(schema_elem.num_children == 1, "Unexpected number of children for stub"); + auto child_col_name_info = (col_name_info) ? 
&col_name_info->children[0] : nullptr; + return build_column(child_col_name_info, schema_elem.children_idx[0], out_col_array); + } + + // if we're at the root, this is a new output column + auto const col_type = + to_type_id(schema_elem, strings_to_categorical, timestamp_type_id, strict_decimal_types); + auto const dtype = col_type == type_id::DECIMAL32 || col_type == type_id::DECIMAL64 + ? data_type{col_type, numeric::scale_type{-schema_elem.decimal_scale}} + : data_type{col_type}; + + column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + // store the index of this element if inserted in out_col_array + nesting.push_back(static_cast(out_col_array.size())); + output_col.name = schema_elem.name; + + // build each child + bool path_is_valid = false; + if (col_name_info == nullptr or col_name_info->children.empty()) { + // add all children of schema_elem. + // At this point, we can no longer pass a col_name_info to build_column + for (int idx = 0; idx < schema_elem.num_children; idx++) { + path_is_valid |= + build_column(nullptr, schema_elem.children_idx[idx], output_col.children); + } + } else { + for (size_t idx = 0; idx < col_name_info->children.size(); idx++) { + path_is_valid |= + build_column(&col_name_info->children[idx], + find_schema_child(schema_elem, col_name_info->children[idx].name), + output_col.children); + } + } + + // if I have no children, we're at a leaf and I'm an input column (that is, one with actual + // data stored) so add me to the list. + if (schema_elem.num_children == 0) { + input_column_info& input_col = + input_columns.emplace_back(input_column_info{schema_idx, schema_elem.name}); + std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); + path_is_valid = true; // If we're able to reach leaf then path is valid + } + + if (path_is_valid) { out_col_array.push_back(std::move(output_col)); } + + nesting.pop_back(); + return path_is_valid; + }; + + std::vector output_column_schemas; - // determine the list of output columns // // there is not necessarily a 1:1 mapping between input columns and output columns. // For example, parquet does not explicitly store a ColumnChunkDesc for struct columns. @@ -657,43 +664,120 @@ class aggregate_metadata { // "firstname", "middlename" and "lastname" represent the input columns in the file that we // process to produce the final cudf "name" column. // - std::vector output_column_schemas; + // A user can ask for a single field out of the struct e.g. firstname. + // In this case they'll pass a fully qualified name to the schema element like + // ["name", "firstname"] + // + auto const& root = get_schema(0); if (use_names.empty()) { - // walk the schema and choose all top level columns - for (size_t schema_idx = 1; schema_idx < pfm.schema.size(); schema_idx++) { - auto const& schema = pfm.schema[schema_idx]; - if (schema.parent_idx == 0) { output_column_schemas.push_back(schema_idx); } + for (auto const& schema_idx : root.children_idx) { + build_column(nullptr, schema_idx, output_columns); + output_column_schemas.push_back(schema_idx); } } else { - // Load subset of columns; include PANDAS index unless excluded - std::vector local_use_names = use_names; - if (include_index) { add_pandas_index_names(local_use_names); } - for (const auto& use_name : local_use_names) { - for (size_t schema_idx = 1; schema_idx < pfm.schema.size(); schema_idx++) { - auto const& schema = pfm.schema[schema_idx]; - // We select only top level columns by name. 
Selecting nested columns by name is not - // supported. Top level columns are identified by their parent being the root (idx == 0) - if (use_name == schema.name and schema.parent_idx == 0) { - output_column_schemas.push_back(schema_idx); - } + struct path_info { + std::string full_path; + int schema_idx; + }; + + // Convert schema into a vector of every possible path + std::vector all_paths; + std::function add_path = [&](std::string path_till_now, + int schema_idx) { + auto const& schema_elem = get_schema(schema_idx); + std::string curr_path = path_till_now + schema_elem.name; + all_paths.push_back({curr_path, schema_idx}); + for (auto const& child_idx : schema_elem.children_idx) { + add_path(curr_path + ".", child_idx); } + }; + for (auto const& child_idx : get_schema(0).children_idx) { + add_path("", child_idx); } - } - // construct input and output output column info - std::vector output_columns; - output_columns.reserve(output_column_schemas.size()); - std::vector input_columns; - std::deque nesting; - for (size_t idx = 0; idx < output_column_schemas.size(); idx++) { - int schema_index = output_column_schemas[idx]; - build_column_info(schema_index, - input_columns, - output_columns, - nesting, - strings_to_categorical, - timestamp_type_id, - strict_decimal_types); + // Find which of the selected paths are valid and get their schema index + std::vector valid_selected_paths; + for (auto const& selected_path : use_names) { + auto found_path = + std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) { + return valid_path.full_path == selected_path; + }); + if (found_path != all_paths.end()) { + valid_selected_paths.push_back({selected_path, found_path->schema_idx}); + } + } + + // Now construct paths as vector of strings for further consumption + std::vector> use_names3; + std::transform(valid_selected_paths.begin(), + valid_selected_paths.end(), + std::back_inserter(use_names3), + [&](path_info const& valid_path) { + auto schema_idx = valid_path.schema_idx; + std::vector result_path; + do { + SchemaElement const& elem = get_schema(schema_idx); + result_path.push_back(elem.name); + schema_idx = elem.parent_idx; + } while (schema_idx > 0); + return std::vector(result_path.rbegin(), result_path.rend()); + }); + + std::vector selected_columns; + if (include_index) { + std::vector index_names = get_pandas_index_names(); + std::transform(index_names.cbegin(), + index_names.cend(), + std::back_inserter(selected_columns), + [](std::string const& name) { return column_name_info(name); }); + } + // Merge the vector use_names into a set of hierarchical column_name_info objects + /* This is because if we have columns like this: + * col1 + * / \ + * s3 f4 + * / \ + * f5 f6 + * + * there may be common paths in use_names like: + * {"col1", "s3", "f5"}, {"col1", "f4"} + * which means we want the output to contain + * col1 + * / \ + * s3 f4 + * / + * f5 + * + * rather than + * col1 col1 + * | | + * s3 f4 + * | + * f5 + */ + for (auto const& path : use_names3) { + auto array_to_find_in = &selected_columns; + for (size_t depth = 0; depth < path.size(); ++depth) { + // Check if the path exists in our selected_columns and if not, add it. 
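The loop above folds each selected dotted path into the running selected_columns forest so that shared prefixes (e.g. {"col1", "s3", "f5"} and {"col1", "f4"}) end up under a single "col1" subtree. A minimal standalone sketch of the same merge, using a hypothetical name_node type in place of column_name_info:

#include <algorithm>
#include <string>
#include <vector>

// Hypothetical stand-in for column_name_info: a name plus child nodes.
struct name_node {
  std::string name;
  std::vector<name_node> children;
};

// Merge one column path into a forest, sharing common prefixes.
// Merging {"col1","s3","f5"} and then {"col1","f4"} produces one "col1" node
// with children "s3" (containing "f5") and "f4".
void merge_path(std::vector<name_node>& forest, std::vector<std::string> const& path)
{
  auto* level = &forest;
  for (auto const& name : path) {
    auto it = std::find_if(level->begin(), level->end(),
                           [&](name_node const& n) { return n.name == name; });
    if (it == level->end()) {
      level->push_back(name_node{name, {}});
      level = &level->back().children;
    } else {
      level = &it->children;  // prefix already present; descend into it
    }
  }
}

Calling merge_path once per requested path yields exactly the hierarchy that build_column later walks against the file schema.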
+ auto const& name_to_find = path[depth]; + auto found_col = std::find_if( + array_to_find_in->begin(), + array_to_find_in->end(), + [&name_to_find](column_name_info const& col) { return col.name == name_to_find; }); + if (found_col == array_to_find_in->end()) { + auto& col = array_to_find_in->emplace_back(name_to_find); + array_to_find_in = &col.children; + } else { + // Path exists. go down further. + array_to_find_in = &found_col->children; + } + } + } + for (auto& col : selected_columns) { + auto const& top_level_col_schema_idx = find_schema_child(root, col.name); + bool valid_column = build_column(&col, top_level_col_schema_idx, output_columns); + if (valid_column) output_column_schemas.push_back(top_level_col_schema_idx); + } } return std::make_tuple( @@ -1581,18 +1665,16 @@ table_with_metadata reader::impl::read(size_type skip_rows, // create the final output cudf columns for (size_t i = 0; i < _output_columns.size(); ++i) { - out_metadata.schema_info.push_back(column_name_info{""}); - out_columns.emplace_back( - make_column(_output_columns[i], &out_metadata.schema_info.back(), stream, _mr)); + column_name_info& col_name = out_metadata.schema_info.emplace_back(""); + out_columns.emplace_back(make_column(_output_columns[i], &col_name, stream, _mr)); } } } // Create empty columns as needed (this can happen if we've ended up with no actual data to read) for (size_t i = out_columns.size(); i < _output_columns.size(); ++i) { - out_metadata.schema_info.push_back(column_name_info{""}); - out_columns.emplace_back(cudf::io::detail::empty_like( - _output_columns[i], &out_metadata.schema_info.back(), stream, _mr)); + column_name_info& col_name = out_metadata.schema_info.emplace_back(""); + out_columns.emplace_back(io::detail::empty_like(_output_columns[i], &col_name, stream, _mr)); } // Return column names (must match order of returned columns) diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 7c0ce03886d..0d4ce40354f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -364,6 +364,26 @@ struct leaf_schema_fn { } }; +inline bool is_col_nullable(LinkedColPtr const& col, + column_in_metadata const& col_meta, + bool single_write_mode) +{ + if (single_write_mode) { + return col->nullable(); + } else { + if (col_meta.is_nullability_defined()) { + CUDF_EXPECTS(col_meta.nullable() || !col->nullable(), + "Mismatch in metadata prescribed nullability and input column nullability. " + "Metadata for nullable input column cannot prescribe nullability = false"); + return col_meta.nullable(); + } else { + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + return true; + } + } +} + /** * @brief Construct schema from input columns and per-column input options * @@ -371,7 +391,7 @@ struct leaf_schema_fn { * The resulting schema tree is stored in a vector in pre-order traversal order. 
*/ std::vector construct_schema_tree(LinkedColVector const& linked_columns, - table_input_metadata const& metadata, + table_input_metadata& metadata, bool single_write_mode, bool int96_timestamps) { @@ -384,27 +404,9 @@ std::vector construct_schema_tree(LinkedColVector const& linke root.parent_idx = -1; // root schema has no parent schema.push_back(std::move(root)); - std::function add_schema = - [&](LinkedColPtr const& col, column_in_metadata const& col_meta, size_t parent_idx) { - bool col_nullable = [&]() { - if (single_write_mode) { - return col->nullable(); - } else { - if (col_meta.is_nullability_defined()) { - if (col_meta.nullable() == false) { - CUDF_EXPECTS( - col->nullable() == false, - "Mismatch in metadata prescribed nullability and input column nullability. " - "Metadata for nullable input column cannot prescribe nullability = false"); - } - return col_meta.nullable(); - } else { - // For chunked write, when not provided nullability, we assume the worst case scenario - // that all columns are nullable. - return true; - } - } - }(); + std::function add_schema = + [&](LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) { + bool col_nullable = is_col_nullable(col, col_meta, single_write_mode); if (col->type().id() == type_id::STRUCT) { // if struct, add current and recursively call for all children @@ -426,7 +428,7 @@ std::vector construct_schema_tree(LinkedColVector const& linke for (size_t i = 0; i < col->children.size(); ++i) { add_schema(col->children[i], col_meta.child(i), struct_node_index); } - } else if (col->type().id() == type_id::LIST) { + } else if (col->type().id() == type_id::LIST && !col_meta.is_map()) { // List schema is denoted by two levels for each nesting level and one final level for leaf. // The top level is the same name as the column name. // So e.g. List> is denoted in the schema by @@ -454,6 +456,58 @@ std::vector construct_schema_tree(LinkedColVector const& linke add_schema(col->children[lists_column_view::child_column_index], col_meta.child(lists_column_view::child_column_index), schema.size() - 1); + } else if (col->type().id() == type_id::LIST && col_meta.is_map()) { + // Map schema is denoted by a list of struct + // e.g. List> will be + // "col_name" : { "key_value" : { "key", "value" } } + + // verify the List child structure is a struct + auto const& struct_col = col->child(lists_column_view::child_column_index); + CUDF_EXPECTS(struct_col.type().id() == type_id::STRUCT, "Map should be a List of struct"); + CUDF_EXPECTS(struct_col.num_children() == 2, + "Map should be a List of struct with two children only but found " + + std::to_string(struct_col.num_children())); + + schema_tree_node map_schema{}; + map_schema.converted_type = ConvertedType::MAP; + map_schema.repetition_type = + col_nullable ? FieldRepetitionType::OPTIONAL : FieldRepetitionType::REQUIRED; + map_schema.name = col_meta.get_name(); + map_schema.num_children = 1; + map_schema.parent_idx = parent_idx; + schema.push_back(std::move(map_schema)); + + schema_tree_node repeat_group{}; + repeat_group.repetition_type = FieldRepetitionType::REPEATED; + repeat_group.name = "key_value"; + repeat_group.num_children = 2; + repeat_group.parent_idx = schema.size() - 1; // Parent is map_schema, last added. 
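As a usage-level illustration of the map branch above: a caller marks a list<struct<...>> column as a map through the column metadata, and the writer emits the MAP / key_value / key / value schema and fills in the child names itself. This is a hedged sketch only; it assumes a table tbl whose single column is a list<struct<string, int32>>, and that column_in_metadata::set_list_column_as_map() is the public setter behind the is_map() check (the writer-options calls and their defaults are assumptions, not verified against this exact revision).

#include <cudf/io/parquet.hpp>
#include <cudf/io/types.hpp>
#include <cudf/table/table_view.hpp>

void write_as_map(cudf::table_view const& tbl)
{
  // tbl is assumed to hold one list<struct<string, int32>> column.
  cudf::io::table_input_metadata meta(tbl);
  meta.column_metadata[0].set_name("my_map").set_list_column_as_map();

  auto opts = cudf::io::parquet_writer_options::builder(
                cudf::io::sink_info{"map_example.parquet"}, tbl)
                .metadata(&meta)
                .build();
  cudf::io::write_parquet(opts);
}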
+ schema.push_back(std::move(repeat_group)); + + CUDF_EXPECTS(col_meta.num_children() == 2, + "List column's metadata should have exactly two children"); + CUDF_EXPECTS(col_meta.child(lists_column_view::child_column_index).num_children() == 2, + "Map struct column should have exactly two children"); + // verify the col meta of children of the struct have name key and value + auto& left_child_meta = col_meta.child(lists_column_view::child_column_index).child(0); + left_child_meta.set_name("key"); + left_child_meta.set_nullability(false); + + auto& right_child_meta = col_meta.child(lists_column_view::child_column_index).child(1); + right_child_meta.set_name("value"); + // check the repetition type of key is required i.e. the col should be non-nullable + auto key_col = col->children[lists_column_view::child_column_index]->children[0]; + CUDF_EXPECTS(!is_col_nullable(key_col, left_child_meta, single_write_mode), + "key column cannot be nullable. For chunked writing, explicitly set the " + "nullability to false in metadata"); + // process key + size_type struct_col_index = schema.size() - 1; + add_schema(key_col, left_child_meta, struct_col_index); + // process value + add_schema(col->children[lists_column_view::child_column_index]->children[1], + right_child_meta, + struct_col_index); + } else { // if leaf, add current if (col->type().id() == type_id::STRING) { @@ -505,7 +559,7 @@ struct parquet_column_view { rmm::cuda_stream_view stream); column_view leaf_column_view() const; - gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream); + gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; column_view cudf_column_view() const { return cudf_col; } parquet::Type physical_type() const { return schema_node.type; } @@ -517,26 +571,6 @@ struct parquet_column_view { uint8_t max_rep_level() const noexcept { return _max_rep_level; } bool is_list() const noexcept { return _is_list; } - // Dictionary related member functions - uint32_t* get_dict_data() { return (_dict_data.size()) ? _dict_data.data() : nullptr; } - uint32_t* get_dict_index() { return (_dict_index.size()) ? 
_dict_index.data() : nullptr; } - void use_dictionary(bool use_dict) { _dictionary_used = use_dict; } - void alloc_dictionary(size_t max_num_rows, rmm::cuda_stream_view stream) - { - _dict_data.resize(max_num_rows, stream); - _dict_index.resize(max_num_rows, stream); - } - bool check_dictionary_used(rmm::cuda_stream_view stream) - { - if (!_dictionary_used) { - _dict_data.resize(0, stream); - _dict_data.shrink_to_fit(stream); - _dict_index.resize(0, stream); - _dict_index.shrink_to_fit(stream); - } - return _dictionary_used; - } - private: // Schema related members schema_tree_node schema_node; @@ -556,11 +590,6 @@ struct parquet_column_view { rmm::device_uvector _def_level; std::vector _nullability; size_type _data_count = 0; - - // Dictionary related members - bool _dictionary_used = false; - rmm::device_uvector _dict_data; - rmm::device_uvector _dict_index; }; parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, @@ -570,9 +599,7 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, _d_nullability(0, stream), _dremel_offsets(0, stream), _rep_level(0, stream), - _def_level(0, stream), - _dict_data(0, stream), - _dict_index(0, stream) + _def_level(0, stream) { // Construct single inheritance column_view from linked_column_view auto curr_col = schema_node.leaf_column.get(); @@ -683,21 +710,14 @@ column_view parquet_column_view::leaf_column_view() const return col; } -gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view stream) +gpu::parquet_column_device_view parquet_column_view::get_device_view( + rmm::cuda_stream_view stream) const { column_view col = leaf_column_view(); auto desc = gpu::parquet_column_device_view{}; // Zero out all fields desc.stats_dtype = schema_node.stats_dtype; desc.ts_scale = schema_node.ts_scale; - // TODO (dm): Enable dictionary for list and struct after refactor - if (physical_type() != BOOLEAN && physical_type() != UNDEFINED_TYPE && - !is_nested(cudf_col.type())) { - alloc_dictionary(_data_count, stream); - desc.dict_index = get_dict_index(); - desc.dict_data = get_dict_data(); - } - if (is_list()) { desc.level_offsets = _dremel_offsets.data(); desc.rep_values = _rep_level.data(); @@ -705,15 +725,9 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s } desc.num_rows = cudf_col.size(); desc.physical_type = static_cast(physical_type()); - auto count_bits = [](uint16_t number) { - int16_t nbits = 0; - while (number > 0) { - nbits++; - number >>= 1; - } - return nbits; - }; - desc.level_bits = count_bits(max_rep_level()) << 4 | count_bits(max_def_level()); + + desc.level_bits = CompactProtocolReader::NumRequiredBits(max_rep_level()) << 4 | + CompactProtocolReader::NumRequiredBits(max_def_level()); desc.nullability = _d_nullability.data(); return desc; } @@ -744,22 +758,99 @@ void writer::impl::gather_fragment_statistics( stream.synchronize(); } -void writer::impl::build_chunk_dictionaries( - hostdevice_2dvector& chunks, - device_span col_desc, - uint32_t num_columns, - uint32_t num_dictionaries) +void writer::impl::init_page_sizes(hostdevice_2dvector& chunks, + device_span col_desc, + uint32_t num_columns) { chunks.host_to_device(stream); - if (num_dictionaries > 0) { - size_t dict_scratch_size = (size_t)num_dictionaries * gpu::kDictScratchSize; - auto dict_scratch = cudf::detail::make_zeroed_device_uvector_async( - dict_scratch_size / sizeof(uint32_t), stream); + gpu::InitEncoderPages(chunks, {}, col_desc, num_columns, nullptr, nullptr, stream); 
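The hand-rolled count_bits lambda removed above is replaced by CompactProtocolReader::NumRequiredBits, and level_bits packs the repetition-level width into the upper nibble and the definition-level width into the lower nibble. A small self-contained sketch of that packing; num_required_bits here mirrors what NumRequiredBits is assumed to compute (the same loop the removed lambda used):

#include <cstdint>

// Number of bits needed to represent `value` (0 -> 0 bits).
constexpr int num_required_bits(uint32_t value)
{
  int nbits = 0;
  while (value > 0) {
    ++nbits;
    value >>= 1;
  }
  return nbits;
}

// Repetition-level bits go in the upper nibble, definition-level bits in the
// lower nibble, e.g. max_rep_level = 2 and max_def_level = 3 both need 2 bits,
// giving (2 << 4) | 2 = 0x22.
constexpr uint8_t pack_level_bits(uint16_t max_rep_level, uint16_t max_def_level)
{
  return static_cast<uint8_t>(num_required_bits(max_rep_level) << 4 |
                              num_required_bits(max_def_level));
}

static_assert(pack_level_bits(2, 3) == 0x22, "rep=2 and def=3 each need 2 bits");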
+ chunks.device_to_host(stream, true); +} + +auto build_chunk_dictionaries(hostdevice_2dvector& chunks, + host_span col_desc, + uint32_t num_rows, + rmm::cuda_stream_view stream) +{ + // At this point, we know all chunks and their sizes. We want to allocate dictionaries for each + // chunk that can have dictionary + + auto h_chunks = chunks.host_view().flat_view(); - gpu::BuildChunkDictionaries(chunks.device_view().flat_view(), dict_scratch.data(), stream); + std::vector> dict_data; + std::vector> dict_index; + + if (h_chunks.size() == 0) { return std::make_pair(std::move(dict_data), std::move(dict_index)); } + + // Allocate slots for each chunk + std::vector> hash_maps_storage; + hash_maps_storage.reserve(h_chunks.size()); + for (auto& chunk : h_chunks) { + if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN) { + chunk.use_dictionary = false; + } else { + chunk.use_dictionary = true; + auto& inserted_map = hash_maps_storage.emplace_back(chunk.num_values, stream); + chunk.dict_map_slots = inserted_map.data(); + chunk.dict_map_size = inserted_map.size(); + } } - gpu::InitEncoderPages(chunks, {}, col_desc, num_columns, nullptr, nullptr, stream); + + chunks.host_to_device(stream); + + gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); + gpu::populate_chunk_hash_maps(chunks, num_rows, stream); + chunks.device_to_host(stream, true); + + // Make decision about which chunks have dictionary + for (auto& ck : h_chunks) { + if (not ck.use_dictionary) { continue; } + std::tie(ck.use_dictionary, ck.dict_rle_bits) = [&]() { + // calculate size of chunk if dictionary is used + + // If we have N unique values then the idx for the last value is N - 1 and nbits is the number + // of bits required to encode indices into the dictionary + auto max_dict_index = (ck.num_dict_entries > 0) ? ck.num_dict_entries - 1 : 0; + auto nbits = CompactProtocolReader::NumRequiredBits(max_dict_index); + + // We don't use dictionary if the indices are > 16 bits because that's the maximum bitpacking + // bitsize we efficiently support + if (nbits > 16) { return std::make_pair(false, 0); } + + // Only these bit sizes are allowed for RLE encoding because it's compute optimized + constexpr auto allowed_bitsizes = std::array{1, 2, 4, 8, 12, 16}; + + // ceil to (1/2/4/8/12/16) + auto rle_bits = *std::lower_bound(allowed_bitsizes.begin(), allowed_bitsizes.end(), nbits); + auto rle_byte_size = util::div_rounding_up_safe(ck.num_values * rle_bits, 8); + + auto dict_enc_size = ck.uniq_data_size + rle_byte_size; + + bool use_dict = (ck.plain_data_size > dict_enc_size); + if (not use_dict) { rle_bits = 0; } + return std::make_pair(use_dict, rle_bits); + }(); + } + + // TODO: (enh) Deallocate hash map storage for chunks that don't use dict and clear pointers. 
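To summarize the decision just made: a chunk keeps dictionary encoding only when the dictionary page (the set of unique values) plus the RLE-bit-packed indices is smaller than plain encoding, with the index width rounded up to one of the RLE-friendly bit sizes. A host-side sketch of the same arithmetic, illustrative only, with byte sizes assumed to be precomputed:

#include <algorithm>
#include <array>
#include <cstddef>
#include <utility>

// Returns {use_dictionary, rle_bits} for a chunk, mirroring the decision above.
std::pair<bool, int> choose_dict_encoding(std::size_t num_dict_entries,
                                          std::size_t num_values,
                                          std::size_t uniq_data_size,
                                          std::size_t plain_data_size)
{
  // Bits needed to index the largest dictionary entry (index N-1 for N entries).
  std::size_t max_index = (num_dict_entries > 0) ? num_dict_entries - 1 : 0;
  int nbits = 0;
  while (max_index > 0) { ++nbits; max_index >>= 1; }

  // Indices wider than 16 bits are not bit-packed efficiently; fall back to plain.
  if (nbits > 16) return {false, 0};

  // Round up to one of the bit widths supported by the RLE encoder.
  constexpr std::array<int, 6> allowed_bitsizes{1, 2, 4, 8, 12, 16};
  int rle_bits = *std::lower_bound(allowed_bitsizes.begin(), allowed_bitsizes.end(), nbits);

  std::size_t rle_byte_size = (num_values * rle_bits + 7) / 8;  // ceil division
  std::size_t dict_enc_size = uniq_data_size + rle_byte_size;   // dict page + packed indices

  return (plain_data_size > dict_enc_size) ? std::make_pair(true, rle_bits)
                                           : std::make_pair(false, 0);
}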
+ + dict_data.reserve(h_chunks.size()); + dict_index.reserve(h_chunks.size()); + for (auto& chunk : h_chunks) { + if (not chunk.use_dictionary) { continue; } + + size_t dict_data_size = std::min(MAX_DICT_SIZE, chunk.dict_map_size); + auto& inserted_dict_data = dict_data.emplace_back(dict_data_size, stream); + auto& inserted_dict_index = dict_index.emplace_back(chunk.num_values, stream); + chunk.dict_data = inserted_dict_data.data(); + chunk.dict_index = inserted_dict_index.data(); + } + chunks.host_to_device(stream); + gpu::collect_map_entries(chunks.device_view().flat_view(), stream); + gpu::get_dictionary_indices(chunks.device_view(), num_rows, stream); + + return std::make_pair(std::move(dict_data), std::move(dict_index)); } void writer::impl::init_encoder_pages(hostdevice_2dvector& chunks, @@ -959,10 +1050,8 @@ void writer::impl::write(table_view const& table) // Initialize column description hostdevice_vector col_desc(parquet_columns.size(), stream); - // This should've been `auto const&` but isn't since dictionary space is allocated when calling - // get_device_view(). Fix during dictionary refactor. std::transform( - parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto& pcol) { + parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto const& pcol) { return pcol.get_device_view(stream); }); @@ -973,11 +1062,9 @@ void writer::impl::write(table_view const& table) // ideally want the page size to be below 1MB so as to have enough pages to get good // compression/decompression performance). using cudf::io::parquet::gpu::max_page_fragment_size; - constexpr uint32_t fragment_size = 5000; - static_assert(fragment_size <= max_page_fragment_size, - "fragment size cannot be greater than max_page_fragment_size"); - uint32_t num_fragments = (uint32_t)((num_rows + fragment_size - 1) / fragment_size); + uint32_t num_fragments = + (uint32_t)((num_rows + max_page_fragment_size - 1) / max_page_fragment_size); cudf::detail::hostdevice_2dvector fragments( num_columns, num_fragments, stream); @@ -987,7 +1074,7 @@ void writer::impl::write(table_view const& table) leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); - init_page_fragments(fragments, col_desc, num_rows, fragment_size); + init_page_fragments(fragments, col_desc, num_rows, max_page_fragment_size); } size_t global_rowgroup_base = md.row_groups.size(); @@ -1002,11 +1089,12 @@ void writer::impl::write(table_view const& table) for (auto i = 0; i < num_columns; i++) { fragment_data_size += fragments[i][f].fragment_data_size; } - if (f > rowgroup_start && (rowgroup_size + fragment_data_size > max_rowgroup_size_ || - (f + 1 - rowgroup_start) * fragment_size > max_rowgroup_rows_)) { + if (f > rowgroup_start && + (rowgroup_size + fragment_data_size > max_rowgroup_size_ || + (f + 1 - rowgroup_start) * max_page_fragment_size > max_rowgroup_rows_)) { // update schema md.row_groups.resize(md.row_groups.size() + 1); - md.row_groups[global_r++].num_rows = (f - rowgroup_start) * fragment_size; + md.row_groups[global_r++].num_rows = (f - rowgroup_start) * max_page_fragment_size; num_rowgroups++; rowgroup_start = f; rowgroup_size = 0; @@ -1015,7 +1103,7 @@ void writer::impl::write(table_view const& table) if (f + 1 == num_fragments) { // update schema md.row_groups.resize(md.row_groups.size() + 1); - md.row_groups[global_r++].num_rows = num_rows - rowgroup_start * fragment_size; + md.row_groups[global_r++].num_rows = num_rows - rowgroup_start * 
max_page_fragment_size; num_rowgroups++; } } @@ -1033,20 +1121,19 @@ void writer::impl::write(table_view const& table) // Initialize row groups and column chunks uint32_t num_chunks = num_rowgroups * num_columns; hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); - uint32_t num_dictionaries = 0; for (uint32_t r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups; r++, global_r++) { - uint32_t fragments_in_chunk = - (uint32_t)((md.row_groups[global_r].num_rows + fragment_size - 1) / fragment_size); + uint32_t fragments_in_chunk = (uint32_t)( + (md.row_groups[global_r].num_rows + max_page_fragment_size - 1) / max_page_fragment_size); md.row_groups[global_r].total_byte_size = 0; md.row_groups[global_r].columns.resize(num_columns); for (int i = 0; i < num_columns; i++) { gpu::EncColumnChunk* ck = &chunks[r][i]; - bool dict_enable = false; - *ck = {}; - ck->col_desc = col_desc.device_ptr() + i; - ck->fragments = &fragments.device_view()[i][f]; + *ck = {}; + ck->col_desc = col_desc.device_ptr() + i; + ck->col_desc_id = i; + ck->fragments = &fragments.device_view()[i][f]; ck->stats = (frag_stats.size() != 0) ? frag_stats.data() + i * num_fragments + f : nullptr; ck->start_row = start_row; ck->num_rows = (uint32_t)md.row_groups[global_r].num_rows; @@ -1056,30 +1143,12 @@ void writer::impl::write(table_view const& table) std::accumulate(chunk_fragments.begin(), chunk_fragments.end(), 0, [](uint32_t l, auto r) { return l + r.num_values; }); - ck->dictionary_id = num_dictionaries; - if (col_desc[i].dict_data) { - size_t plain_size = 0; - size_t dict_size = 1; - uint32_t num_dict_vals = 0; - for (uint32_t j = 0; j < fragments_in_chunk && num_dict_vals < 65536; j++) { - plain_size += chunk_fragments[j].fragment_data_size; - dict_size += chunk_fragments[j].dict_data_size + - ((num_dict_vals > 256) ? 
2 : 1) * chunk_fragments[j].non_nulls; - num_dict_vals += chunk_fragments[j].num_dict_vals; - } - if (dict_size < plain_size) { - parquet_columns[i].use_dictionary(true); - dict_enable = true; - num_dictionaries++; - } - } - ck->has_dictionary = dict_enable; + ck->plain_data_size = std::accumulate( + chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { + return sum + frag.fragment_data_size; + }); md.row_groups[global_r].columns[i].meta_data.type = parquet_columns[i].physical_type(); md.row_groups[global_r].columns[i].meta_data.encodings = {Encoding::PLAIN, Encoding::RLE}; - if (dict_enable) { - md.row_groups[global_r].columns[i].meta_data.encodings.push_back( - Encoding::PLAIN_DICTIONARY); - } md.row_groups[global_r].columns[i].meta_data.path_in_schema = parquet_columns[i].get_path_in_schema(); md.row_groups[global_r].columns[i].meta_data.codec = UNCOMPRESSED; @@ -1089,15 +1158,18 @@ void writer::impl::write(table_view const& table) start_row += (uint32_t)md.row_groups[global_r].num_rows; } - // Free unused dictionaries - for (auto& col : parquet_columns) { - col.check_dictionary_used(stream); + auto dict_info_owner = build_chunk_dictionaries(chunks, col_desc, num_rows, stream); + for (uint32_t rg = 0, global_rg = global_rowgroup_base; rg < num_rowgroups; rg++, global_rg++) { + for (int col = 0; col < num_columns; col++) { + if (chunks.host_view()[rg][col].use_dictionary) { + md.row_groups[global_rg].columns[col].meta_data.encodings.push_back( + Encoding::PLAIN_DICTIONARY); + } + } } // Build chunk dictionaries and count pages - if (num_chunks != 0) { - build_chunk_dictionaries(chunks, col_desc, num_columns, num_dictionaries); - } + if (num_chunks != 0) { init_page_sizes(chunks, col_desc, num_columns); } // Initialize batches of rowgroups to encode (mainly to limit peak memory usage) std::vector batch_list; @@ -1247,9 +1319,9 @@ void writer::impl::write(table_view const& table) } md.row_groups[global_r].total_byte_size += ck->compressed_size; md.row_groups[global_r].columns[i].meta_data.data_page_offset = - current_chunk_offset + ((ck->has_dictionary) ? ck->dictionary_size : 0); + current_chunk_offset + ((ck->use_dictionary) ? ck->dictionary_size : 0); md.row_groups[global_r].columns[i].meta_data.dictionary_page_offset = - (ck->has_dictionary) ? current_chunk_offset : 0; + (ck->use_dictionary) ? 
current_chunk_offset : 0; md.row_groups[global_r].columns[i].meta_data.total_uncompressed_size = ck->bfr_size; md.row_groups[global_r].columns[i].meta_data.total_compressed_size = ck->compressed_size; current_chunk_offset += ck->compressed_size; diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 8d9bdc8adbd..8fb1a8294fb 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -153,12 +153,11 @@ class writer::impl { * @param chunks column chunk array * @param col_desc column description array * @param num_columns Total number of columns - * @param num_dictionaries Total number of dictionaries */ - void build_chunk_dictionaries(hostdevice_2dvector& chunks, - device_span col_desc, - uint32_t num_columns, - uint32_t num_dictionaries); + void init_page_sizes(hostdevice_2dvector& chunks, + device_span col_desc, + uint32_t num_columns); + /** * @brief Initialize encoder pages * diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index a6b4978aeab..88297423b9b 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -252,7 +252,7 @@ __device__ __inline__ char const* seek_field_end(char const* begin, bool quotation = false; auto current = begin; bool escape_next = false; - while (true) { + while (current < end) { // Use simple logic to ignore control chars between any quote seq // Handles nominal cases including doublequotes within quotes, but // may not output exact failures as PANDAS for malformed fields. @@ -262,7 +262,7 @@ __device__ __inline__ char const* seek_field_end(char const* begin, quotation = !quotation; } else if (!quotation) { if (*current == opts.delimiter) { - while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) { + while (opts.multi_delimiter && (current + 1 < end) && *(current + 1) == opts.delimiter) { ++current; } break; @@ -283,8 +283,7 @@ __device__ __inline__ char const* seek_field_end(char const* begin, } } - if (current >= end) break; - current++; + if (current < end) { current++; } } return current; } diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index c7a1630311b..bfabe99aaf9 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -13,118 +13,400 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include +#include +#include +#include #include #include +#include #include +#include +#include +#include +#include +#include #include +#include + namespace cudf { namespace detail { std::pair>, std::unique_ptr>> -conditional_join(table_view left, - table_view right, - ast::expression binary_predicate, +conditional_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls, - join_kind JoinKind, + join_kind join_type, + std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); - return get_conditional_join_indices( - left, right, JoinKind, binary_predicate, compare_nulls, stream, mr); + // We can immediately filter out cases where the right table is empty. In + // some cases, we return all the rows of the left table with a corresponding + // null index for the right table; in others, we return an empty output. 
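The empty-right-table fast path described in the comment above returns every left row index paired with a "no match" entry. An illustrative thrust sketch of that trivial result; the -1 sentinel is an assumption for illustration only, since the actual sentinel used by get_trivial_left_join_indices is not shown in this diff:

#include <cudf/types.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <utility>

// Every left row index, each paired with a sentinel meaning "no right match".
std::pair<rmm::device_uvector<cudf::size_type>, rmm::device_uvector<cudf::size_type>>
trivial_left_indices(cudf::size_type left_num_rows, rmm::cuda_stream_view stream)
{
  rmm::device_uvector<cudf::size_type> left_idx(left_num_rows, stream);
  rmm::device_uvector<cudf::size_type> right_idx(left_num_rows, stream);
  thrust::sequence(rmm::exec_policy(stream), left_idx.begin(), left_idx.end());
  thrust::fill(rmm::exec_policy(stream), right_idx.begin(), right_idx.end(),
               cudf::size_type{-1});
  return {std::move(left_idx), std::move(right_idx)};
}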
+ if (right.num_rows() == 0) { + switch (join_type) { + // Left, left anti, and full (which are effectively left because we are + // guaranteed that left has more rows than right) all return a all the + // row indices from left with a corresponding NULL from the right. + case join_kind::LEFT_JOIN: + case join_kind::LEFT_ANTI_JOIN: + case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream); + // Inner and left semi joins return empty output because no matches can exist. + case join_kind::INNER_JOIN: + case join_kind::LEFT_SEMI_JOIN: + return std::make_pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); + } + } + + // Prepare output column. Whether or not the output column is nullable is + // determined by whether any of the columns in the input table are nullable. + // If none of the input columns actually contain nulls, we can still use the + // non-nullable version of the expression evaluation code path for + // performance, so we capture that information as well. + auto const nullable = cudf::nullable(left) || cudf::nullable(right); + auto const has_nulls = nullable && (cudf::has_nulls(left) || cudf::has_nulls(right)); + + auto const parser = + ast::detail::expression_parser{binary_predicate, left, right, has_nulls, stream, mr}; + CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, + "The expression must produce a boolean output."); + + auto left_table = table_device_view::create(left, stream); + auto right_table = table_device_view::create(right, stream); + + // Allocate storage for the counter used to get the size of the join output + detail::grid_1d config(left_table->num_rows(), DEFAULT_JOIN_BLOCK_SIZE); + auto const shmem_size_per_block = + parser.device_expression_data.shmem_per_thread * config.num_threads_per_block; + join_kind kernel_join_type = join_type == join_kind::FULL_JOIN ? join_kind::LEFT_JOIN : join_type; + + // If the join size was not provided as an input, compute it here. + std::size_t join_size; + if (output_size.has_value()) { + join_size = *output_size; + } else { + rmm::device_scalar size(0, stream, mr); + CHECK_CUDA(stream.value()); + if (has_nulls) { + compute_conditional_join_output_size + <<>>( + *left_table, + *right_table, + kernel_join_type, + compare_nulls, + parser.device_expression_data, + size.data()); + } else { + compute_conditional_join_output_size + <<>>( + *left_table, + *right_table, + kernel_join_type, + compare_nulls, + parser.device_expression_data, + size.data()); + } + CHECK_CUDA(stream.value()); + join_size = size.value(stream); + } + + // If the output size will be zero, we can return immediately. 
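When the caller does not supply output_size, the kernel launch above performs a counting-only pass so the output buffers can then be allocated exactly; a caller-provided size skips that pass entirely. A plain, serial C++ analogue of the two-pass count-then-fill pattern, for clarity only (on the GPU the fill pass advances a shared write_index with atomics):

#include <cstddef>
#include <utility>
#include <vector>

template <typename Pred>
std::pair<std::vector<int>, std::vector<int>>
two_pass_join(int left_rows, int right_rows, Pred matches)
{
  // Pass 1: size only (the role of compute_conditional_join_output_size).
  std::size_t join_size = 0;
  for (int l = 0; l < left_rows; ++l)
    for (int r = 0; r < right_rows; ++r)
      if (matches(l, r)) ++join_size;

  // Pass 2: allocate exactly join_size entries and fill them.
  std::vector<int> left_idx, right_idx;
  left_idx.reserve(join_size);
  right_idx.reserve(join_size);
  for (int l = 0; l < left_rows; ++l)
    for (int r = 0; r < right_rows; ++r)
      if (matches(l, r)) { left_idx.push_back(l); right_idx.push_back(r); }

  return {std::move(left_idx), std::move(right_idx)};
}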
+ if (join_size == 0) { + return std::make_pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); + } + + rmm::device_scalar write_index(0, stream); + + auto left_indices = std::make_unique>(join_size, stream, mr); + auto right_indices = std::make_unique>(join_size, stream, mr); + + auto const& join_output_l = left_indices->data(); + auto const& join_output_r = right_indices->data(); + if (has_nulls) { + conditional_join + <<>>( + *left_table, + *right_table, + kernel_join_type, + compare_nulls, + join_output_l, + join_output_r, + write_index.data(), + parser.device_expression_data, + join_size); + } else { + conditional_join + <<>>( + *left_table, + *right_table, + kernel_join_type, + compare_nulls, + join_output_l, + join_output_r, + write_index.data(), + parser.device_expression_data, + join_size); + } + + CHECK_CUDA(stream.value()); + + auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); + + // For full joins, get the indices in the right table that were not joined to + // by any row in the left table. + if (join_type == join_kind::FULL_JOIN) { + auto complement_indices = detail::get_left_join_indices_complement( + join_indices.second, left.num_rows(), right.num_rows(), stream, mr); + join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); + } + return join_indices; +} + +std::size_t compute_conditional_join_output_size(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + join_kind join_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // We can immediately filter out cases where the right table is empty. In + // some cases, we return all the rows of the left table with a corresponding + // null index for the right table; in others, we return an empty output. + if (right.num_rows() == 0) { + switch (join_type) { + // Left, left anti, and full (which are effectively left because we are + // guaranteed that left has more rows than right) all return a all the + // row indices from left with a corresponding NULL from the right. + case join_kind::LEFT_JOIN: + case join_kind::LEFT_ANTI_JOIN: + case join_kind::FULL_JOIN: return left.num_rows(); + // Inner and left semi joins return empty output because no matches can exist. + case join_kind::INNER_JOIN: + case join_kind::LEFT_SEMI_JOIN: return 0; + } + } + + // Prepare output column. Whether or not the output column is nullable is + // determined by whether any of the columns in the input table are nullable. + // If none of the input columns actually contain nulls, we can still use the + // non-nullable version of the expression evaluation code path for + // performance, so we capture that information as well. 
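For callers, the new *_size entry points pair naturally with the optional output_size parameter: compute the size once, then hand it back so the join itself skips its sizing pass. A hedged usage sketch; pred is an already-constructed cudf::ast::expression comparing columns of left and right (its construction is out of scope here), and the defaulted memory-resource arguments are assumed:

#include <cstddef>
#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>

void run_conditional_inner_join(cudf::table_view const& left,
                                cudf::table_view const& right,
                                cudf::ast::expression const& pred)
{
  std::size_t const join_size =
    cudf::conditional_inner_join_size(left, right, pred, cudf::null_equality::EQUAL);

  auto [left_indices, right_indices] =
    cudf::conditional_inner_join(left, right, pred, cudf::null_equality::EQUAL, join_size);

  // left_indices / right_indices are device vectors of matching row indices.
  (void)left_indices;
  (void)right_indices;
}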
+ auto const nullable = cudf::nullable(left) || cudf::nullable(right); + auto const has_nulls = nullable && (cudf::has_nulls(left) || cudf::has_nulls(right)); + + auto const parser = + ast::detail::expression_parser{binary_predicate, left, right, has_nulls, stream, mr}; + CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, + "The expression must produce a boolean output."); + + auto left_table = table_device_view::create(left, stream); + auto right_table = table_device_view::create(right, stream); + + // Allocate storage for the counter used to get the size of the join output + rmm::device_scalar size(0, stream, mr); + CHECK_CUDA(stream.value()); + detail::grid_1d config(left_table->num_rows(), DEFAULT_JOIN_BLOCK_SIZE); + auto const shmem_size_per_block = + parser.device_expression_data.shmem_per_thread * config.num_threads_per_block; + + // Determine number of output rows without actually building the output to simply + // find what the size of the output will be. + assert(join_type != join_kind::FULL_JOIN); + if (has_nulls) { + compute_conditional_join_output_size + <<>>( + *left_table, + *right_table, + join_type, + compare_nulls, + parser.device_expression_data, + size.data()); + } else { + compute_conditional_join_output_size + <<>>( + *left_table, + *right_table, + join_type, + compare_nulls, + parser.device_expression_data, + size.data()); + } + CHECK_CUDA(stream.value()); + + return size.value(stream); } } // namespace detail std::pair>, std::unique_ptr>> -conditional_inner_join(table_view left, - table_view right, - ast::expression binary_predicate, +conditional_inner_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls, + std::optional output_size, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); return detail::conditional_join(left, right, binary_predicate, compare_nulls, detail::join_kind::INNER_JOIN, + output_size, rmm::cuda_stream_default, mr); } std::pair>, std::unique_ptr>> -conditional_left_join(table_view left, - table_view right, - ast::expression binary_predicate, +conditional_left_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls, + std::optional output_size, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); return detail::conditional_join(left, right, binary_predicate, compare_nulls, detail::join_kind::LEFT_JOIN, + output_size, rmm::cuda_stream_default, mr); } std::pair>, std::unique_ptr>> -conditional_full_join(table_view left, - table_view right, - ast::expression binary_predicate, +conditional_full_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); return detail::conditional_join(left, right, binary_predicate, compare_nulls, detail::join_kind::FULL_JOIN, + {}, rmm::cuda_stream_default, mr); } std::unique_ptr> conditional_left_semi_join( - table_view left, - table_view right, - ast::expression binary_predicate, + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls, + std::optional output_size, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); return std::move(detail::conditional_join(left, right, binary_predicate, compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, + output_size, rmm::cuda_stream_default, mr) .first); } std::unique_ptr> conditional_left_anti_join( - table_view 
left, - table_view right, - ast::expression binary_predicate, + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, null_equality compare_nulls, + std::optional output_size, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); return std::move(detail::conditional_join(left, right, binary_predicate, compare_nulls, detail::join_kind::LEFT_ANTI_JOIN, + output_size, rmm::cuda_stream_default, mr) .first); } +std::size_t conditional_inner_join_size(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::compute_conditional_join_output_size(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::INNER_JOIN, + rmm::cuda_stream_default, + mr); +} + +std::size_t conditional_left_join_size(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::compute_conditional_join_output_size(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_JOIN, + rmm::cuda_stream_default, + mr); +} + +std::size_t conditional_left_semi_join_size(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return std::move(detail::compute_conditional_join_output_size(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_SEMI_JOIN, + rmm::cuda_stream_default, + mr)); +} + +std::size_t conditional_left_anti_join_size(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return std::move(detail::compute_conditional_join_output_size(left, + right, + binary_predicate, + compare_nulls, + detail::join_kind::LEFT_ANTI_JOIN, + rmm::cuda_stream_default, + mr)); +} + } // namespace cudf diff --git a/cpp/src/join/conditional_join.cuh b/cpp/src/join/conditional_join.cuh deleted file mode 100644 index 4602b7fefaa..00000000000 --- a/cpp/src/join/conditional_join.cuh +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
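The conditional_*_join_size entry points introduced above let a caller pay for the sizing kernel once and reuse the result, since the join functions now take the size as an optional argument. A usage sketch under the signatures shown in this diff (the sized_inner_join helper is hypothetical; error handling and stream selection are omitted):

#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <rmm/device_uvector.hpp>

#include <memory>
#include <utility>

// Hypothetical helper: size the conditional inner join once, then reuse that size when
// materializing the gather maps so the sizing kernel is not launched a second time.
std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
          std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
sized_inner_join(cudf::table_view const& left,
                 cudf::table_view const& right,
                 cudf::ast::expression const& predicate)  // AST type as declared via cudf's join header
{
  // First pass: run only the sizing kernel.
  auto const join_size =
    cudf::conditional_inner_join_size(left, right, predicate, cudf::null_equality::EQUAL);

  // Second pass: materialize the gather maps; passing join_size skips a second sizing pass.
  return cudf::conditional_inner_join(
    left, right, predicate, cudf::null_equality::EQUAL, join_size);
}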
- */ -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace cudf { -namespace detail { - -/** - * @brief Computes the join operation between two tables and returns the - * output indices of left and right table as a combined table - * - * @param left Table of left columns to join - * @param right Table of right columns to join - * tables have been flipped, meaning the output indices should also be flipped - * @param JoinKind The type of join to be performed - * @param compare_nulls Controls whether null join-key values should match or not. - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return Join output indices vector pair - */ -std::pair>, - std::unique_ptr>> -get_conditional_join_indices(table_view const& left, - table_view const& right, - join_kind JoinKind, - ast::expression binary_pred, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // We can immediately filter out cases where the right table is empty. In - // some cases, we return all the rows of the left table with a corresponding - // null index for the right table; in others, we return an empty output. - if (right.num_rows() == 0) { - switch (JoinKind) { - // Left, left anti, and full (which are effectively left because we are - // guaranteed that left has more rows than right) all return a all the - // row indices from left with a corresponding NULL from the right. - case join_kind::LEFT_JOIN: - case join_kind::LEFT_ANTI_JOIN: - case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream); - // Inner and left semi joins return empty output because no matches can exist. - case join_kind::INNER_JOIN: - case join_kind::LEFT_SEMI_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); - } - } - - // Prepare output column. Whether or not the output column is nullable is - // determined by whether any of the columns in the input table are nullable. - // If none of the input columns actually contain nulls, we can still use the - // non-nullable version of the expression evaluation code path for - // performance, so we capture that information as well. - auto const nullable = cudf::nullable(left) || cudf::nullable(right); - auto const has_nulls = nullable && (cudf::has_nulls(left) || cudf::has_nulls(right)); - - auto const plan = ast::detail::ast_plan{binary_pred, left, right, has_nulls, stream, mr}; - CUDF_EXPECTS(plan.output_type().id() == type_id::BOOL8, - "The expression must produce a boolean output."); - - auto left_table = table_device_view::create(left, stream); - auto right_table = table_device_view::create(right, stream); - - // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); - CHECK_CUDA(stream.value()); - constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - detail::grid_1d config(left_table->num_rows(), block_size); - auto const shmem_size_per_block = plan.dev_plan.shmem_per_thread * config.num_threads_per_block; - - // Determine number of output rows without actually building the output to simply - // find what the size of the output will be. - join_kind KernelJoinKind = JoinKind == join_kind::FULL_JOIN ? 
join_kind::LEFT_JOIN : JoinKind; - if (has_nulls) { - compute_conditional_join_output_size - <<>>( - *left_table, *right_table, KernelJoinKind, compare_nulls, plan.dev_plan, size.data()); - } else { - compute_conditional_join_output_size - <<>>( - *left_table, *right_table, KernelJoinKind, compare_nulls, plan.dev_plan, size.data()); - } - CHECK_CUDA(stream.value()); - - size_type const join_size = size.value(stream); - - // If the output size will be zero, we can return immediately. - if (join_size == 0) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); - } - - rmm::device_scalar write_index(0, stream); - - auto left_indices = std::make_unique>(join_size, stream, mr); - auto right_indices = std::make_unique>(join_size, stream, mr); - - const auto& join_output_l = left_indices->data(); - const auto& join_output_r = right_indices->data(); - if (has_nulls) { - conditional_join - <<>>( - *left_table, - *right_table, - KernelJoinKind, - compare_nulls, - join_output_l, - join_output_r, - write_index.data(), - plan.dev_plan, - join_size); - } else { - conditional_join - <<>>( - *left_table, - *right_table, - KernelJoinKind, - compare_nulls, - join_output_l, - join_output_r, - write_index.data(), - plan.dev_plan, - join_size); - } - - CHECK_CUDA(stream.value()); - - auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); - - // For full joins, get the indices in the right table that were not joined to - // by any row in the left table. - if (JoinKind == join_kind::FULL_JOIN) { - auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, left.num_rows(), right.num_rows(), stream, mr); - join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); - } - return join_indices; -} - -} // namespace detail - -} // namespace cudf diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp new file mode 100644 index 00000000000..5a3fe887838 --- /dev/null +++ b/cpp/src/join/conditional_join.hpp @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "join_common_utils.hpp" + +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace detail { + +/** + * @brief Computes the join operation between two tables and returns the + * output indices of left and right table as a combined table + * + * @param left Table of left columns to join + * @param right Table of right columns to join + * tables have been flipped, meaning the output indices should also be flipped + * @param JoinKind The type of join to be performed + * @param compare_nulls Controls whether null join-key values should match or not. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Join output indices vector pair + */ +std::pair>, + std::unique_ptr>> +conditional_join(table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + join_kind JoinKind, + std::optional output_size = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Computes the size of a join operation between two tables without + * materializing the result and returns the total size value. + * + * @param left Table of left columns to join + * @param right Table of right columns to join + * tables have been flipped, meaning the output indices should also be flipped + * @param JoinKind The type of join to be performed + * @param compare_nulls Controls whether null join-key values should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Join output indices vector pair + */ +std::size_t compute_conditional_join_output_size( + table_view const& left, + table_view const& right, + ast::expression const& binary_predicate, + null_equality compare_nulls, + join_kind JoinKind, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index 3d34a49c5af..9fcc7bf5cfb 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -19,13 +19,13 @@ #include #include -#include -#include -#include -#include +#include +#include #include #include +#include + namespace cudf { namespace detail { @@ -40,18 +40,20 @@ namespace detail { * * @param[in] left_table The left table * @param[in] right_table The right table - * @param[in] JoinKind The type of join to be performed + * @param[in] join_type The type of join to be performed * @param[in] compare_nulls Controls whether null join-key values should match or not. - * @param[in] plan Container of device data required to evaluate the desired expression. + * @param[in] device_expression_data Container of device data required to evaluate the desired + * expression. * @param[out] output_size The resulting output size */ template -__global__ void compute_conditional_join_output_size(table_device_view left_table, - table_device_view right_table, - join_kind JoinKind, - null_equality compare_nulls, - ast::detail::device_ast_plan plan, - cudf::size_type* output_size) +__global__ void compute_conditional_join_output_size( + table_device_view left_table, + table_device_view right_table, + join_kind join_type, + null_equality compare_nulls, + ast::detail::expression_device_view device_expression_data, + std::size_t* output_size) { // The (required) extern storage of the shared memory array leads to // conflicting declarations between different templates. 
The easiest @@ -60,16 +62,17 @@ __global__ void compute_conditional_join_output_size(table_device_view left_tabl extern __shared__ char raw_intermediate_storage[]; cudf::ast::detail::IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates]; + auto thread_intermediate_storage = + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; - cudf::size_type thread_counter(0); - const cudf::size_type left_start_idx = threadIdx.x + blockIdx.x * blockDim.x; - const cudf::size_type left_stride = blockDim.x * gridDim.x; - const cudf::size_type left_num_rows = left_table.num_rows(); - const cudf::size_type right_num_rows = right_table.num_rows(); + std::size_t thread_counter{0}; + cudf::size_type const left_start_idx = threadIdx.x + blockIdx.x * blockDim.x; + cudf::size_type const left_stride = blockDim.x * gridDim.x; + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, plan, thread_intermediate_storage, compare_nulls); + left_table, right_table, device_expression_data, thread_intermediate_storage, compare_nulls); for (cudf::size_type left_row_index = left_start_idx; left_row_index < left_num_rows; left_row_index += left_stride) { @@ -78,15 +81,15 @@ __global__ void compute_conditional_join_output_size(table_device_view left_tabl auto output_dest = cudf::ast::detail::value_expression_result(); evaluator.evaluate(output_dest, left_row_index, right_row_index, 0); if (output_dest.is_valid() && output_dest.value()) { - if ((JoinKind != join_kind::LEFT_ANTI_JOIN) && - !(JoinKind == join_kind::LEFT_SEMI_JOIN && found_match)) { + if ((join_type != join_kind::LEFT_ANTI_JOIN) && + !(join_type == join_kind::LEFT_SEMI_JOIN && found_match)) { ++thread_counter; } found_match = true; } } - if ((JoinKind == join_kind::LEFT_JOIN || JoinKind == join_kind::LEFT_ANTI_JOIN || - JoinKind == join_kind::FULL_JOIN) && + if ((join_type == join_kind::LEFT_JOIN || join_type == join_kind::LEFT_ANTI_JOIN || + join_type == join_kind::FULL_JOIN) && (!found_match)) { ++thread_counter; } @@ -94,7 +97,7 @@ __global__ void compute_conditional_join_output_size(table_device_view left_tabl using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - cudf::size_type block_counter = BlockReduce(temp_storage).Sum(thread_counter); + std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); // Add block counter to global counter if (threadIdx.x == 0) atomicAdd(output_size, block_counter); @@ -112,25 +115,26 @@ __global__ void compute_conditional_join_output_size(table_device_view left_tabl * * @param[in] left_table The left table * @param[in] right_table The right table - * @param[in] JoinKind The type of join to be performed + * @param[in] join_type The type of join to be performed * @param compare_nulls Controls whether null join-key values should match or not. * @param[out] join_output_l The left result of the join operation * @param[out] join_output_r The right result of the join operation * @param[in,out] current_idx A global counter used by threads to coordinate * writes to the global output - * @param plan Container of device data required to evaluate the desired expression. 
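The per-thread tally in the size kernel above encodes the sizing rule for each join type: every accepted pair counts for inner and left joins, a left row with no matches still contributes one pair for left and anti joins, a semi join counts each left row at most once, and an anti join counts only unmatched left rows (full joins are sized as left joins and completed later with the right-side complement). A small host-side cross-check of those rules, with a hypothetical helper name:

#include <algorithm>
#include <cstddef>
#include <vector>

// match_counts[i] = number of right rows the predicate accepts for left row i.
// E.g. {2, 0, 1}: inner -> 3, left -> 4 (the unmatched row emits one NULL pair),
// left semi -> 2 (at most one hit per left row), left anti -> 1 (unmatched rows only).
struct join_sizes {
  std::size_t inner, left, semi, anti;
};

join_sizes expected_sizes(std::vector<std::size_t> const& match_counts)
{
  join_sizes s{0, 0, 0, 0};
  for (auto const m : match_counts) {
    s.inner += m;
    s.left += std::max<std::size_t>(m, 1);
    s.semi += (m > 0) ? 1 : 0;
    s.anti += (m == 0) ? 1 : 0;
  }
  return s;
}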
+ * @param device_expression_data Container of device data required to evaluate the desired + * expression. * @param[in] max_size The maximum size of the output */ template __global__ void conditional_join(table_device_view left_table, table_device_view right_table, - join_kind JoinKind, + join_kind join_type, null_equality compare_nulls, cudf::size_type* join_output_l, cudf::size_type* join_output_r, cudf::size_type* current_idx, - cudf::ast::detail::device_ast_plan plan, - const cudf::size_type max_size) + cudf::ast::detail::expression_device_view device_expression_data, + cudf::size_type const max_size) { constexpr int num_warps = block_size / detail::warp_size; __shared__ cudf::size_type current_idx_shared[num_warps]; @@ -144,12 +148,13 @@ __global__ void conditional_join(table_device_view left_table, extern __shared__ char raw_intermediate_storage[]; cudf::ast::detail::IntermediateDataType* intermediate_storage = reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates]; + auto thread_intermediate_storage = + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; - const int warp_id = threadIdx.x / detail::warp_size; - const int lane_id = threadIdx.x % detail::warp_size; - const cudf::size_type left_num_rows = left_table.num_rows(); - const cudf::size_type right_num_rows = right_table.num_rows(); + int const warp_id = threadIdx.x / detail::warp_size; + int const lane_id = threadIdx.x % detail::warp_size; + cudf::size_type const left_num_rows = left_table.num_rows(); + cudf::size_type const right_num_rows = right_table.num_rows(); if (0 == lane_id) { current_idx_shared[warp_id] = 0; } @@ -157,10 +162,10 @@ __global__ void conditional_join(table_device_view left_table, cudf::size_type left_row_index = threadIdx.x + blockIdx.x * blockDim.x; - const unsigned int activemask = __ballot_sync(0xffffffff, left_row_index < left_num_rows); + unsigned int const activemask = __ballot_sync(0xffffffff, left_row_index < left_num_rows); auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, plan, thread_intermediate_storage, compare_nulls); + left_table, right_table, device_expression_data, thread_intermediate_storage, compare_nulls); if (left_row_index < left_num_rows) { bool found_match = false; @@ -176,8 +181,8 @@ __global__ void conditional_join(table_device_view left_table, // that the current logic relies on the fact that we process all right // table rows for a single left table row on a single thread so that no // synchronization of found_match is required). - if ((JoinKind != join_kind::LEFT_ANTI_JOIN) && - !(JoinKind == join_kind::LEFT_SEMI_JOIN && found_match)) { + if ((join_type != join_kind::LEFT_ANTI_JOIN) && + !(join_type == join_kind::LEFT_SEMI_JOIN && found_match)) { add_pair_to_cache(left_row_index, right_row_index, current_idx_shared, @@ -209,8 +214,8 @@ __global__ void conditional_join(table_device_view left_table, // Left, left anti, and full joins all require saving left columns that // aren't present in the right. 
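Note how the conditional_join kernel above takes the warp's active mask with __ballot_sync before entering the divergent per-row work, so that later warp-synchronous steps (such as flushing the shared-memory output cache) involve only lanes that actually hold a row. The idiom in isolation (illustrative CUDA, not the cudf kernel):

__global__ void active_mask_example(int const* data, int num_rows, int* out)
{
  int const row = threadIdx.x + blockIdx.x * blockDim.x;

  // Vote *before* diverging: every lane in the warp participates, so the mask is well defined.
  unsigned int const activemask = __ballot_sync(0xffffffff, row < num_rows);

  if (row < num_rows) {
    out[row] = data[row] * 2;  // stand-in for the per-row join work
    __syncwarp(activemask);    // synchronize only the lanes that entered this branch
  }
}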
- if ((JoinKind == join_kind::LEFT_JOIN || JoinKind == join_kind::LEFT_ANTI_JOIN || - JoinKind == join_kind::FULL_JOIN) && + if ((join_type == join_kind::LEFT_JOIN || join_type == join_kind::LEFT_ANTI_JOIN || + join_type == join_kind::FULL_JOIN) && (!found_match)) { add_pair_to_cache(left_row_index, static_cast(JoinNoneValue), diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 526edbf6903..740431b8563 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 2b1c870bea1..d5c23b1d612 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -21,9 +21,7 @@ #include #include -#include -#include #include namespace cudf { @@ -31,7 +29,9 @@ namespace detail { /** * @brief Computes the trivial left join operation for the case when the - * right table is empty. In this case all the valid indices of the left table + * right table is empty. + * + * In this case all the valid indices of the left table * are returned with their corresponding right indices being set to * JoinNoneValue, i.e. -1. * @@ -41,21 +41,12 @@ namespace detail { * * @return Join output indices vector pair */ -inline std::pair>, - std::unique_ptr>> +std::pair>, + std::unique_ptr>> get_trivial_left_join_indices( table_view const& left, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto left_indices = std::make_unique>(left.num_rows(), stream, mr); - thrust::sequence(rmm::exec_policy(stream), left_indices->begin(), left_indices->end(), 0); - auto right_indices = - std::make_unique>(left.num_rows(), stream, mr); - thrust::uninitialized_fill( - rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue); - return std::make_pair(std::move(left_indices), std::move(right_indices)); -} + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); // Convenient alias for a pair of unique pointers to device uvectors. using VectorPair = std::pair>, @@ -83,47 +74,11 @@ using VectorPair = std::pair>, * * @return A pair of vectors containing the concatenated output. */ -inline VectorPair concatenate_vector_pairs(VectorPair& a, - VectorPair& b, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS((a.first->size() == a.second->size()), - "Mismatch between sizes of vectors in vector pair"); - CUDF_EXPECTS((b.first->size() == b.second->size()), - "Mismatch between sizes of vectors in vector pair"); - if (a.first->is_empty()) { - return std::move(b); - } else if (b.first->is_empty()) { - return std::move(a); - } - auto original_size = a.first->size(); - a.first->resize(a.first->size() + b.first->size(), stream); - a.second->resize(a.second->size() + b.second->size(), stream); - thrust::copy( - rmm::exec_policy(stream), b.first->begin(), b.first->end(), a.first->begin() + original_size); - thrust::copy(rmm::exec_policy(stream), - b.second->begin(), - b.second->end(), - a.second->begin() + original_size); - return std::move(a); -} - -/** - * @brief Device functor to determine if an index is contained in a range. 
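get_trivial_left_join_indices, reduced above to a declaration (its definition moves to join_utils.cu later in this diff), builds the gather maps for a left-style join against an empty right table: the left map is simply 0..num_rows-1 and the right map is all JoinNoneValue. A plain Thrust sketch of that result, using thrust::device_vector instead of rmm::device_uvector:

#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>

void trivial_left_join_indices(int num_left_rows)
{
  thrust::device_vector<int> left_indices(num_left_rows);
  thrust::device_vector<int> right_indices(num_left_rows);

  thrust::sequence(left_indices.begin(), left_indices.end());    // 0, 1, 2, ..., n-1
  thrust::fill(right_indices.begin(), right_indices.end(), -1);  // JoinNoneValue for every row
}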
- */ -template -struct valid_range { - T start, stop; - __host__ __device__ valid_range(const T begin, const T end) : start(begin), stop(end) {} - - __host__ __device__ __forceinline__ bool operator()(const T index) - { - return ((index >= start) && (index < stop)); - } -}; +VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream); /** * @brief Creates a table containing the complement of left join indices. + * * This table has two columns. The first one is filled with JoinNoneValue(-1) * and the second one contains values from 0 to right_table_row_count - 1 * excluding those found in the right_indices column. @@ -136,72 +91,27 @@ struct valid_range { * * @return Pair of vectors containing the left join indices complement */ -inline std::pair>, - std::unique_ptr>> +std::pair>, + std::unique_ptr>> get_left_join_indices_complement(std::unique_ptr>& right_indices, size_type left_table_row_count, size_type right_table_row_count, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Get array of indices that do not appear in right_indices - - // Vector allocated for unmatched result - auto right_indices_complement = - std::make_unique>(right_table_row_count, stream); - - // If left table is empty in a full join call then all rows of the right table - // should be represented in the joined indices. This is an optimization since - // if left table is empty and full join is called all the elements in - // right_indices will be JoinNoneValue, i.e. -1. This if path should - // produce exactly the same result as the else path but will be faster. - if (left_table_row_count == 0) { - thrust::sequence(rmm::exec_policy(stream), - right_indices_complement->begin(), - right_indices_complement->end(), - 0); - } else { - // Assume all the indices in invalid_index_map are invalid - auto invalid_index_map = - std::make_unique>(right_table_row_count, stream); - thrust::uninitialized_fill( - rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), int32_t{1}); - - // Functor to check for index validity since left joins can create invalid indices - valid_range valid(0, right_table_row_count); + rmm::mr::device_memory_resource* mr); - // invalid_index_map[index_ptr[i]] = 0 for i = 0 to right_table_row_count - // Thus specifying that those locations are valid - thrust::scatter_if(rmm::exec_policy(stream), - thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0) + right_indices->size(), - right_indices->begin(), // Index locations - right_indices->begin(), // Stencil - Check if index location is valid - invalid_index_map->begin(), // Output indices - valid); // Stencil Predicate - size_type begin_counter = static_cast(0); - size_type end_counter = static_cast(right_table_row_count); +/** + * @brief Device functor to determine if an index is contained in a range. 
+ */ +template +struct valid_range { + T start, stop; + __host__ __device__ valid_range(const T begin, const T end) : start(begin), stop(end) {} - // Create list of indices that have been marked as invalid - size_type indices_count = thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(begin_counter), - thrust::make_counting_iterator(end_counter), - invalid_index_map->begin(), - right_indices_complement->begin(), - thrust::identity()) - - right_indices_complement->begin(); - right_indices_complement->resize(indices_count, stream); + __host__ __device__ __forceinline__ bool operator()(const T index) + { + return ((index >= start) && (index < stop)); } - - auto left_invalid_indices = - std::make_unique>(right_indices_complement->size(), stream); - thrust::uninitialized_fill(rmm::exec_policy(stream), - left_invalid_indices->begin(), - left_invalid_indices->end(), - JoinNoneValue); - - return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement)); -} +}; /** * @brief Adds a pair of indices to the shared memory cache diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index d2337e28ed4..d2541b006a7 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,8 +19,6 @@ #include #include -#include - #include #include @@ -49,26 +47,7 @@ using row_equality = cudf::row_equality_comparator; enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; -inline bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type) -{ - // If there is nothing to join, then send empty table with all columns - if (left.is_empty() || right.is_empty()) { return true; } - - // If left join and the left table is empty, return immediately - if ((join_kind::LEFT_JOIN == join_type) && (0 == left.num_rows())) { return true; } - - // If Inner Join and either table is empty, return immediately - if ((join_kind::INNER_JOIN == join_type) && ((0 == left.num_rows()) || (0 == right.num_rows()))) { - return true; - } - - // If left semi join (contains) and right table is empty, - // return immediately - if ((join_kind::LEFT_SEMI_JOIN == join_type) && (0 == right.num_rows())) { return true; } - - return false; -} +bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type); } // namespace detail - } // namespace cudf diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu new file mode 100644 index 00000000000..4aca4b4a9cf --- /dev/null +++ b/cpp/src/join/join_utils.cu @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include + +namespace cudf { +namespace detail { + +bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type) +{ + // If there is nothing to join, then send empty table with all columns + if (left.is_empty() || right.is_empty()) { return true; } + + // If left join and the left table is empty, return immediately + if ((join_kind::LEFT_JOIN == join_type) && (0 == left.num_rows())) { return true; } + + // If Inner Join and either table is empty, return immediately + if ((join_kind::INNER_JOIN == join_type) && ((0 == left.num_rows()) || (0 == right.num_rows()))) { + return true; + } + + // If left semi join (contains) and right table is empty, + // return immediately + if ((join_kind::LEFT_SEMI_JOIN == join_type) && (0 == right.num_rows())) { return true; } + + return false; +} + +std::pair>, + std::unique_ptr>> +get_trivial_left_join_indices(table_view const& left, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto left_indices = std::make_unique>(left.num_rows(), stream, mr); + thrust::sequence(rmm::exec_policy(stream), left_indices->begin(), left_indices->end(), 0); + auto right_indices = + std::make_unique>(left.num_rows(), stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue); + return std::make_pair(std::move(left_indices), std::move(right_indices)); +} + +VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS((a.first->size() == a.second->size()), + "Mismatch between sizes of vectors in vector pair"); + CUDF_EXPECTS((b.first->size() == b.second->size()), + "Mismatch between sizes of vectors in vector pair"); + if (a.first->is_empty()) { + return std::move(b); + } else if (b.first->is_empty()) { + return std::move(a); + } + auto original_size = a.first->size(); + a.first->resize(a.first->size() + b.first->size(), stream); + a.second->resize(a.second->size() + b.second->size(), stream); + thrust::copy( + rmm::exec_policy(stream), b.first->begin(), b.first->end(), a.first->begin() + original_size); + thrust::copy(rmm::exec_policy(stream), + b.second->begin(), + b.second->end(), + a.second->begin() + original_size); + return std::move(a); +} + +std::pair>, + std::unique_ptr>> +get_left_join_indices_complement(std::unique_ptr>& right_indices, + size_type left_table_row_count, + size_type right_table_row_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Get array of indices that do not appear in right_indices + + // Vector allocated for unmatched result + auto right_indices_complement = + std::make_unique>(right_table_row_count, stream); + + // If left table is empty in a full join call then all rows of the right table + // should be represented in the joined indices. This is an optimization since + // if left table is empty and full join is called all the elements in + // right_indices will be JoinNoneValue, i.e. -1. This if path should + // produce exactly the same result as the else path but will be faster. 
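The branch that follows implements the complement on the device with scatter_if and copy_if; as a plain host-side illustration of what get_left_join_indices_complement ultimately produces (hypothetical helper, std containers only):

#include <unordered_set>
#include <utility>
#include <vector>

// For each right-table row index that never appears in right_indices, emit the pair
// (JoinNoneValue, row). E.g. right_indices = {1, 3, -1} with 5 right rows gives
// left = {-1, -1, -1} and right = {0, 2, 4}.
std::pair<std::vector<int>, std::vector<int>> left_join_indices_complement(
  std::vector<int> const& right_indices, int right_table_row_count)
{
  std::unordered_set<int> const matched(right_indices.begin(), right_indices.end());
  std::vector<int> left_out;
  std::vector<int> right_out;
  for (int row = 0; row < right_table_row_count; ++row) {
    if (matched.count(row) == 0) {
      left_out.push_back(-1);  // JoinNoneValue
      right_out.push_back(row);
    }
  }
  return {left_out, right_out};
}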
+ if (left_table_row_count == 0) { + thrust::sequence(rmm::exec_policy(stream), + right_indices_complement->begin(), + right_indices_complement->end(), + 0); + } else { + // Assume all the indices in invalid_index_map are invalid + auto invalid_index_map = + std::make_unique>(right_table_row_count, stream); + thrust::uninitialized_fill( + rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), int32_t{1}); + + // Functor to check for index validity since left joins can create invalid indices + valid_range valid(0, right_table_row_count); + + // invalid_index_map[index_ptr[i]] = 0 for i = 0 to right_table_row_count + // Thus specifying that those locations are valid + thrust::scatter_if(rmm::exec_policy(stream), + thrust::make_constant_iterator(0), + thrust::make_constant_iterator(0) + right_indices->size(), + right_indices->begin(), // Index locations + right_indices->begin(), // Stencil - Check if index location is valid + invalid_index_map->begin(), // Output indices + valid); // Stencil Predicate + size_type begin_counter = static_cast(0); + size_type end_counter = static_cast(right_table_row_count); + + // Create list of indices that have been marked as invalid + size_type indices_count = thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(begin_counter), + thrust::make_counting_iterator(end_counter), + invalid_index_map->begin(), + right_indices_complement->begin(), + thrust::identity()) - + right_indices_complement->begin(); + right_indices_complement->resize(indices_count, stream); + } + + auto left_invalid_indices = + std::make_unique>(right_indices_complement->size(), stream); + thrust::uninitialized_fill(rmm::exec_policy(stream), + left_invalid_indices->begin(), + left_invalid_indices->end(), + JoinNoneValue); + + return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement)); +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index cc34aed33ea..69a7b8c722b 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -18,15 +18,12 @@ #include #include -#include - #include -#include +#include +#include #include -#include #include #include -#include #include #include @@ -34,11 +31,15 @@ #include #include +#include +#include +#include + namespace cudf { namespace detail { -template std::unique_ptr> left_semi_anti_join( + join_kind const kind, cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls, @@ -48,13 +49,13 @@ std::unique_ptr> left_semi_anti_join( CUDF_EXPECTS(0 != left_keys.num_columns(), "Left table is empty"); CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty"); - if (is_trivial_join(left_keys, right_keys, JoinKind)) { + if (is_trivial_join(left_keys, right_keys, kind)) { return std::make_unique>(0, stream, mr); } - if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right_keys.num_rows())) { + if ((join_kind::LEFT_ANTI_JOIN == kind) && (0 == right_keys.num_rows())) { auto result = std::make_unique>(left_keys.num_rows(), stream, mr); - thrust::sequence(thrust::cuda::par.on(stream.value()), result->begin(), result->end()); + thrust::sequence(rmm::exec_policy(stream), result->begin(), result->end()); return result; } @@ -115,7 +116,7 @@ std::unique_ptr> left_semi_anti_join( // // For semi join we want contains to be true, for anti join we want contains to be false - bool join_type_boolean = (JoinKind == join_kind::LEFT_SEMI_JOIN); + bool const join_type_boolean = 
(kind == join_kind::LEFT_SEMI_JOIN); auto gather_map = std::make_unique>(left_num_rows, stream, mr); @@ -152,27 +153,26 @@ std::unique_ptr> left_semi_anti_join( * @throws cudf::logic_error if number of returned columns is 0 * @throws cudf::logic_error if number of elements in `right_on` and `left_on` are not equal * - * @param[in] left The left table - * @param[in] right The right table - * @param[in] left_on The column indices from `left` to join on. - * The column from `left` indicated by `left_on[i]` - * will be compared against the column from `right` - * indicated by `right_on[i]`. - * @param[in] right_on The column indices from `right` to join on. - * The column from `right` indicated by `right_on[i]` - * will be compared against the column from `left` - * indicated by `left_on[i]`. - * @param[in] compare_nulls Controls whether null join-key values should match or not. - * @param[in] mr Device memory resource to used to allocate the returned table's - * device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @tparam join_kind Indicates whether to do LEFT_SEMI_JOIN or LEFT_ANTI_JOIN + * @param kind Indicates whether to do LEFT_SEMI_JOIN or LEFT_ANTI_JOIN + * @param left The left table + * @param right The right table + * @param left_on The column indices from `left` to join on. + * The column from `left` indicated by `left_on[i]` + * will be compared against the column from `right` + * indicated by `right_on[i]`. + * @param right_on The column indices from `right` to join on. + * The column from `right` indicated by `right_on[i]` + * will be compared against the column from `left` + * indicated by `left_on[i]`. + * @param compare_nulls Controls whether null join-key values should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to used to allocate the returned table * - * @returns Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. + * @returns Result of joining `left` and `right` tables on the columns + * specified by `left_on` and `right_on`. */ -template std::unique_ptr left_semi_anti_join( + join_kind const kind, cudf::table_view const& left, cudf::table_view const& right, std::vector const& left_on, @@ -183,11 +183,11 @@ std::unique_ptr left_semi_anti_join( { CUDF_EXPECTS(left_on.size() == right_on.size(), "Mismatch in number of columns to be joined on"); - if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, JoinKind)) { + if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, kind)) { return empty_like(left); } - if ((join_kind::LEFT_ANTI_JOIN == JoinKind) && (0 == right.num_rows())) { + if ((join_kind::LEFT_ANTI_JOIN == kind) && (0 == right.num_rows())) { // Everything matches, just copy the proper columns from the left table return std::make_unique
(left, stream, mr); } @@ -202,14 +202,23 @@ std::unique_ptr left_semi_anti_join( auto const left_selected = matched.second.front(); auto const right_selected = matched.second.back(); - auto gather_map = - left_semi_anti_join(left_selected, right_selected, compare_nulls, stream); + auto gather_vector = + left_semi_anti_join(kind, left_selected, right_selected, compare_nulls, stream); + + // wrapping the device vector with a column view allows calling the non-iterator + // version of detail::gather, improving compile time by 10% and reducing the + // object file size by 2.2x without affecting performance + auto gather_map = column_view(data_type{type_id::INT32}, + static_cast(gather_vector->size()), + gather_vector->data(), + nullptr, + 0); auto const left_updated = scatter_columns(left_selected, left_on, left); return cudf::detail::gather(left_updated, - gather_map->begin(), - gather_map->end(), + gather_map, out_of_bounds_policy::DONT_CHECK, + negative_index_policy::NOT_ALLOWED, stream, mr); } @@ -224,8 +233,14 @@ std::unique_ptr left_semi_join(cudf::table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); + return detail::left_semi_anti_join(detail::join_kind::LEFT_SEMI_JOIN, + left, + right, + left_on, + right_on, + compare_nulls, + rmm::cuda_stream_default, + mr); } std::unique_ptr> left_semi_join( @@ -235,8 +250,8 @@ std::unique_ptr> left_semi_join( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - left, right, compare_nulls, rmm::cuda_stream_default, mr); + return detail::left_semi_anti_join( + detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr left_anti_join(cudf::table_view const& left, @@ -247,8 +262,14 @@ std::unique_ptr left_anti_join(cudf::table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); + return detail::left_semi_anti_join(detail::join_kind::LEFT_ANTI_JOIN, + left, + right, + left_on, + right_on, + compare_nulls, + rmm::cuda_stream_default, + mr); } std::unique_ptr> left_anti_join( @@ -258,8 +279,8 @@ std::unique_ptr> left_anti_join( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join( - left, right, compare_nulls, rmm::cuda_stream_default, mr); + return detail::left_semi_anti_join( + detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index c5a28a8ec5f..fb6bff3f129 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -51,13 +51,9 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, auto out_offsets = make_numeric_column( data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); - // The array of int8_t stores validities for the output list elements. - auto validities = rmm::device_uvector(build_null_mask ? 
num_rows : 0, stream); - auto const d_out_offsets = out_offsets->mutable_view().template begin(); auto const d_row_offsets = lists_column_view(input).offsets_begin(); auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); - auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child()); // Concatenating the lists at the same row by converting the entry offsets from the child column // into row offsets of the root column. Those entry offsets are subtracted by the first entry @@ -67,22 +63,7 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, iter, iter + num_rows + 1, d_out_offsets, - [d_row_offsets, - d_list_offsets, - lists_dv = *lists_dv_ptr, - d_validities = validities.begin(), - build_null_mask, - iter] __device__(auto const idx) { - if (build_null_mask) { - // The output row will be null only if all lists on the input row are null. - auto const is_valid = thrust::any_of(thrust::seq, - iter + d_row_offsets[idx], - iter + d_row_offsets[idx + 1], - [&] __device__(auto const list_idx) { - return lists_dv.is_valid(list_idx); - }); - d_validities[idx] = static_cast(is_valid); - } + [d_row_offsets, d_list_offsets] __device__(auto const idx) { auto const start_offset = d_list_offsets[d_row_offsets[0]]; return d_list_offsets[d_row_offsets[idx]] - start_offset; }); @@ -92,10 +73,23 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, lists_column_view(lists_column_view(input).get_sliced_child(stream)).get_sliced_child(stream)); auto [null_mask, null_count] = [&] { - return build_null_mask - ? cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr) - : std::make_pair(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + if (!build_null_mask) + return std::make_pair(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + + // The output row will be null only if all lists on the input row are null. + auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child(), stream); + return cudf::detail::valid_if( + iter, + iter + num_rows, + [d_row_offsets, lists_dv = *lists_dv_ptr, iter] __device__(auto const idx) { + return thrust::any_of( + thrust::seq, + iter + d_row_offsets[idx], + iter + d_row_offsets[idx + 1], + [&] __device__(auto const list_idx) { return lists_dv.is_valid(list_idx); }); + }, + stream, + mr); }(); return make_lists_column(num_rows, diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index 5da8aef5853..4e69baef6ed 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -172,29 +172,24 @@ struct interleave_list_entries_fn { rmm::mr::device_memory_resource* mr) const noexcept { auto const table_dv_ptr = table_device_view::create(input); - auto const comp_fn = compute_string_sizes_and_interleave_lists_fn{ + auto comp_fn = compute_string_sizes_and_interleave_lists_fn{ *table_dv_ptr, output_list_offsets.template begin(), data_has_null_mask}; - if (data_has_null_mask) { - auto [offsets_column, chars_column, null_mask, null_count] = - cudf::strings::detail::make_strings_children_with_null_mask( - comp_fn, num_output_lists, num_output_entries, stream, mr); - return make_strings_column(num_output_entries, - std::move(offsets_column), - std::move(chars_column), - null_count, - std::move(null_mask), - stream, - mr); - } + auto validities = + rmm::device_uvector(data_has_null_mask ? 
num_output_entries : 0, stream); + comp_fn.d_validities = validities.data(); auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( comp_fn, num_output_lists, num_output_entries, stream, mr); + + auto [null_mask, null_count] = cudf::detail::valid_if( + validities.begin(), validities.end(), thrust::identity{}, stream, mr); + return make_strings_column(num_output_entries, std::move(offsets_column), std::move(chars_column), - 0, - rmm::device_buffer{}, + null_count, + std::move(null_mask), stream, mr); } diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp index 1df0a4ab41a..def4a400488 100644 --- a/cpp/src/quantiles/quantiles_util.hpp +++ b/cpp/src/quantiles/quantiles_util.hpp @@ -156,6 +156,8 @@ template CUDA_HOST_DEVICE_CALLABLE Result select_quantile_data(Iterator begin, size_type size, double q, interpolation interp) { + if (size == 0) return static_cast(*begin); + quantile_index idx(size, q); switch (interp) { diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index d902efd8b06..b15708c5cf8 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -28,32 +29,111 @@ namespace cudf { namespace detail { namespace { -struct interleave_columns_functor { - template - std::enable_if_t() and not std::is_same_v and - not std::is_same_v, - std::unique_ptr> - operator()(Args&&...) +// Error case when no other overload or specialization is available +template +struct interleave_columns_impl { + template + std::unique_ptr operator()(Args&&...) { - CUDF_FAIL("Called `interleave_columns` on none-supported data type."); + CUDF_FAIL("Unsupported type in `interleave_columns`."); } +}; +struct interleave_columns_functor { template - std::enable_if_t, std::unique_ptr> operator()( - table_view const& lists_columns, - bool create_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + std::unique_ptr operator()(table_view const& input, + bool create_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + return interleave_columns_impl{}(input, create_mask, stream, mr); + } +}; + +template +struct interleave_columns_impl>> { + std::unique_ptr operator()(table_view const& lists_columns, + bool create_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return lists::detail::interleave_columns(lists_columns, create_mask, stream, mr); } +}; - template - std::enable_if_t, std::unique_ptr> operator()( - table_view const& strings_columns, - bool create_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +template +struct interleave_columns_impl>> { + std::unique_ptr operator()(table_view const& structs_columns, + bool create_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + // We can safely call `column(0)` as the number of columns is known to be non zero. 
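The interleave_columns refactor above replaces a single functor full of enable_if-constrained operator() overloads with a family of interleave_columns_impl partial specializations: the functor handed to the type dispatcher just forwards to the matching impl, and any type without a specialization falls through to one failing primary template. A condensed, cudf-free sketch of that shape (the integral category stands in for cudf's fixed-width/strings/lists/structs checks):

#include <iostream>
#include <stdexcept>
#include <type_traits>
#include <utility>

// Primary template: any type without a dedicated specialization is rejected here,
// mirroring the CUDF_FAIL fallback for unsupported types.
template <typename T, typename Enable = void>
struct impl {
  template <typename... Args>
  void operator()(Args&&...) const
  {
    throw std::logic_error("Unsupported type");
  }
};

// One specialization per supported category.
template <typename T>
struct impl<T, std::enable_if_t<std::is_integral_v<T>>> {
  void operator()() const { std::cout << "fixed-width path\n"; }
};

// The functor given to the type dispatcher only forwards to the matching impl.
struct dispatch_functor {
  template <typename T, typename... Args>
  void operator()(Args&&... args) const
  {
    impl<T>{}(std::forward<Args>(args)...);
  }
};

int main()
{
  dispatch_functor{}.operator()<int>();  // selects the integral specialization
  // dispatch_functor{}.operator()<float>();  // would throw: no specialization for float
}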
+ auto const num_children = structs_columns.column(0).num_children(); + CUDF_EXPECTS( + std::all_of(structs_columns.begin(), + structs_columns.end(), + [num_children](auto const& col) { return col.num_children() == num_children; }), + "Number of children of the input structs columns must be the same"); + + auto const num_columns = structs_columns.num_columns(); + auto const num_rows = structs_columns.num_rows(); + auto const output_size = num_columns * num_rows; + + // Interleave the children of the structs columns. + std::vector> output_struct_members; + for (size_type child_idx = 0; child_idx < num_children; ++child_idx) { + // Collect children columns from the input structs columns at index `child_idx`. + auto const child_iter = + thrust::make_transform_iterator(structs_columns.begin(), [child_idx](auto const& col) { + return structs_column_view(col).get_sliced_child(child_idx); + }); + auto children = std::vector(child_iter, child_iter + num_columns); + + auto const child_type = children.front().type(); + CUDF_EXPECTS( + std::all_of(children.cbegin(), + children.cend(), + [child_type](auto const& col) { return child_type == col.type(); }), + "Children of the input structs columns at the same child index must have the same type"); + + auto const children_nullable = std::any_of( + children.cbegin(), children.cend(), [](auto const& col) { return col.nullable(); }); + output_struct_members.emplace_back( + type_dispatcher(child_type, + interleave_columns_functor{}, + table_view{std::move(children)}, + children_nullable, + stream, + mr)); + } + + auto const create_mask_fn = [&] { + auto const input_dv_ptr = table_device_view::create(structs_columns); + auto const validity_fn = [input_dv = *input_dv_ptr, num_columns] __device__(auto const idx) { + return input_dv.column(idx % num_columns).is_valid(idx / num_columns); + }; + return cudf::detail::valid_if(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(output_size), + validity_fn, + stream, + mr); + }; + + // Only create null mask if at least one input structs column is nullable. + auto [null_mask, null_count] = + create_mask ? 
create_mask_fn() : std::pair{rmm::device_buffer{0, stream, mr}, size_type{0}}; + return make_structs_column( + output_size, std::move(output_struct_members), null_count, std::move(null_mask), stream, mr); + } +}; + +template +struct interleave_columns_impl>> { + std::unique_ptr operator()(table_view const& strings_columns, + bool create_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto num_columns = strings_columns.num_columns(); if (num_columns == 1) // Single strings column returns a copy @@ -105,7 +185,7 @@ struct interleave_columns_functor { cudf::detail::get_value(offsets_column->view(), num_strings, stream); auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); // Fill the chars column - auto d_results_chars = chars_column->mutable_view().data(); + auto d_results_chars = chars_column->mutable_view().template data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -131,13 +211,14 @@ struct interleave_columns_functor { stream, mr); } +}; - template - std::enable_if_t(), std::unique_ptr> operator()( - table_view const& input, - bool create_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +template +struct interleave_columns_impl()>> { + std::unique_ptr operator()(table_view const& input, + bool create_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto arch_column = input.column(0); auto output_size = input.num_columns() * input.num_rows(); @@ -184,30 +265,33 @@ struct interleave_columns_functor { }; } // anonymous namespace -} // namespace detail std::unique_ptr interleave_columns(table_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_FUNC_RANGE(); CUDF_EXPECTS(input.num_columns() > 0, "input must have at least one column to determine dtype."); auto const dtype = input.column(0).type(); - CUDF_EXPECTS(std::all_of(std::cbegin(input), std::cend(input), [dtype](auto const& col) { return dtype == col.type(); }), - "DTYPE mismatch"); + "Input columns must have the same type"); auto const output_needs_mask = std::any_of( std::cbegin(input), std::cend(input), [](auto const& col) { return col.nullable(); }); - return type_dispatcher(dtype, - detail::interleave_columns_functor{}, - input, - output_needs_mask, - rmm::cuda_stream_default, - mr); + return type_dispatcher( + dtype, detail::interleave_columns_functor{}, input, output_needs_mask, stream, mr); +} + +} // namespace detail + +std::unique_ptr interleave_columns(table_view const& input, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::interleave_columns(input, rmm::cuda_stream_default, mr); } } // namespace cudf diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 045bfbe0327..f982e7b99f2 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include @@ -520,6 +520,13 @@ list_scalar::list_scalar(list_scalar const& other, column_view list_scalar::view() const { return _data.view(); } +struct_scalar::struct_scalar(struct_scalar const& other, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : scalar{other, stream, mr}, _data(other._data, stream, mr) +{ +} + struct_scalar::struct_scalar(table_view const& data, bool is_valid, rmm::cuda_stream_view stream, @@ -567,12 +574,13 @@ void struct_scalar::superimpose_nulls(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* 
mr) { // push validity mask down - std::vector host_validity({0}); - auto validity = cudf::detail::make_device_uvector_sync(host_validity, stream, mr); + std::vector host_validity( + cudf::bitmask_allocation_size_bytes(1) / sizeof(bitmask_type), 0); + auto validity = cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream); auto iter = thrust::make_counting_iterator(0); std::for_each(iter, iter + _data.num_columns(), [&](size_type i) { cudf::structs::detail::superimpose_parent_nulls( - validity.data(), 1, _data.get_column(i), stream, mr); + static_cast(validity.data()), 1, _data.get_column(i), stream, mr); }); } diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index 2ef27759124..3e0bb8704b6 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -60,9 +61,6 @@ struct compute_size_and_concatenate_fn { // If d_chars != nullptr: only concatenate strings. char* d_chars{nullptr}; - // We need to set `1` or `0` for the validities of the output strings. - int8_t* d_validities{nullptr}; - __device__ bool output_is_null(size_type const idx, size_type const start_idx, size_type const end_idx) const noexcept @@ -73,33 +71,31 @@ struct compute_size_and_concatenate_fn { __device__ void operator()(size_type const idx) const noexcept { - // If this is the second pass, and the row `idx` is known to be a null string - if (d_chars && !d_validities[idx]) { return; } + // If this is the second pass, and the row `idx` is known to be a null or empty string + if (d_chars && (d_offsets[idx] == d_offsets[idx + 1])) { return; } // Indices of the strings within the list row auto const start_idx = list_offsets[idx]; auto const end_idx = list_offsets[idx + 1]; if (!d_chars && output_is_null(idx, start_idx, end_idx)) { - d_offsets[idx] = 0; - d_validities[idx] = false; + d_offsets[idx] = 0; return; } auto const separator = func.separator(idx); - auto size_bytes = size_type{0}; char* output_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; - bool has_valid_element = false; bool write_separator = false; + auto size_bytes = size_type{0}; + bool has_valid_element = false; for (size_type str_idx = start_idx; str_idx < end_idx; ++str_idx) { bool null_element = strings_dv.is_null(str_idx); has_valid_element = has_valid_element || !null_element; if (!d_chars && (null_element && !string_narep_dv.is_valid())) { - d_offsets[idx] = 0; - d_validities[idx] = false; - return; // early termination: the entire list of strings will result in a null string + size_bytes = 0; + break; } if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { @@ -119,11 +115,7 @@ struct compute_size_and_concatenate_fn { // If there are all null elements, the output should be the same as having an empty list input: // a null or an empty string - if (!d_chars) { - d_offsets[idx] = has_valid_element ? size_bytes : 0; - d_validities[idx] = - has_valid_element || empty_list_policy == output_if_empty_list::EMPTY_STRING; - } + if (!d_chars) { d_offsets[idx] = has_valid_element ? 
size_bytes : 0; } } }; @@ -144,6 +136,33 @@ struct scalar_separator_fn { __device__ string_view separator(size_type const) const noexcept { return d_separator.value(); } }; +template +struct validities_fn { + CompFn comp_fn; + + validities_fn(CompFn comp_fn) : comp_fn(comp_fn) {} + + __device__ bool operator()(size_type idx) + { + auto const start_idx = comp_fn.list_offsets[idx]; + auto const end_idx = comp_fn.list_offsets[idx + 1]; + bool valid_output = !comp_fn.output_is_null(idx, start_idx, end_idx); + if (valid_output) { + bool check_elements = false; + for (size_type str_idx = start_idx; str_idx < end_idx; ++str_idx) { + bool const valid_element = comp_fn.strings_dv.is_valid(str_idx); + check_elements = check_elements || valid_element; + // if an element is null and narep is invalid, the output row is null + if (!valid_element && !comp_fn.string_narep_dv.is_valid()) { return false; } + } + // handle empty-list-as-null output policy setting + valid_output = + check_elements || comp_fn.empty_list_policy == output_if_empty_list::EMPTY_STRING; + } + return valid_output; + } +}; + } // namespace std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, @@ -180,8 +199,14 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string string_narep_dv, separate_nulls, empty_list_policy}; - auto [offsets_column, chars_column, null_mask, null_count] = - make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); + + auto [offsets_column, chars_column] = make_strings_children(comp_fn, num_rows, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + validities_fn{comp_fn}, + stream, + mr); return make_strings_column(num_rows, std::move(offsets_column), @@ -254,8 +279,14 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string string_narep_dv, separate_nulls, empty_list_policy}; - auto [offsets_column, chars_column, null_mask, null_count] = - make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); + + auto [offsets_column, chars_column] = make_strings_children(comp_fn, num_rows, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(thrust::counting_iterator(0), + thrust::counting_iterator(num_rows), + validities_fn{comp_fn}, + stream, + mr); return make_strings_column(num_rows, std::move(offsets_column), diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 8dcb260a7ee..c8b4b859020 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -535,7 +535,7 @@ struct parse_duration { auto ptr = d_string.data(); auto length = d_string.size_bytes(); int8_t hour_shift{0}; - for (size_t idx = 0; idx < items_count; ++idx) { + for (size_type idx = 0; idx < items_count; ++idx) { auto item = d_format_items[idx]; if (length < item.length) return 1; if (item.item_type == format_char_type::literal) { // static character we'll just skip; @@ -567,7 +567,7 @@ struct parse_duration { break; case 'S': // [-]SS[.mmm][uuu][nnn] timeparts->second = parse_second(ptr, item_length); - if (*(ptr + item_length) == '.') { + if ((item_length < length) && *(ptr + item_length) == '.') { item_length++; int64_t nanoseconds = str2int_fixed( ptr + item_length, 9, length - item_length, item_length); // normalize to nanoseconds diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 
866ff1adbc6..8d77c7da4cc 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -16,8 +16,7 @@ #include #include -#include -#include +#include #include #include #include @@ -27,12 +26,11 @@ #include #include +#include #include -#include "thrust/iterator/transform_iterator.h" #include -#include -#include +#include #include namespace cudf { @@ -287,12 +285,15 @@ std::unique_ptr concatenate(host_span columns, column_view offsets_child = column->child(strings_column_view::offsets_column_index); column_view chars_child = column->child(strings_column_view::chars_column_index); - auto d_offsets = offsets_child.data() + column_offset; - int32_t bytes_offset = thrust::device_pointer_cast(d_offsets)[0]; + auto bytes_offset = + cudf::detail::get_value(offsets_child, column_offset, stream); // copy the chars column data - auto d_chars = chars_child.data() + bytes_offset; - size_type bytes = thrust::device_pointer_cast(d_offsets)[column_size] - bytes_offset; + auto d_chars = chars_child.data() + bytes_offset; + auto const bytes = + cudf::detail::get_value(offsets_child, column_size + column_offset, stream) - + bytes_offset; + CUDA_TRY( cudaMemcpyAsync(d_new_chars, d_chars, bytes, cudaMemcpyDeviceToDevice, stream.value())); diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 462efedffe5..5f7b195e8f9 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -37,39 +37,57 @@ namespace strings { namespace detail { namespace { +/** + * @brief Return the capturing group index pattern to use with the given replacement string. + * + * Only two patterns are supported at this time `\d` and `${d}` where `d` is an integer in + * the range 1-99. The `\d` pattern is returned by default unless no `\d` pattern is found in + * the `repl` string, + * + * Reference: https://www.regular-expressions.info/refreplacebackref.html + */ +std::string get_backref_pattern(std::string const& repl) +{ + std::string const backslash_pattern = "\\\\(\\d+)"; + std::string const bracket_pattern = "\\$\\{(\\d+)\\}"; + std::smatch m; + return std::regex_search(repl, m, std::regex(backslash_pattern)) ? backslash_pattern + : bracket_pattern; +} /** * @brief Parse the back-ref index and position values from a given replace format. * - * The backref numbers are expected to be 1-based. + * The back-ref numbers are expected to be 1-based. + * + * Returns a modified string without back-ref indicators and a vector of back-ref + * byte position pairs. These are used by the device code to build the output + * string by placing the captured group elements into the replace format. * - * Returns a modified string without back-ref indicators and a vector of backref - * byte position pairs. - * ``` - * Example: - * for input string: 'hello \2 and \1' - * the returned pairs: (2,6),(1,11) - * returned string is: 'hello and ' - * ``` + * For example, for input string 'hello \2 and \1' the returned `backref_type` vector + * contains `[(2,6),(1,11)]` and the returned string is 'hello and '. 
*/ std::pair> parse_backrefs(std::string const& repl) { std::vector backrefs; std::string str = repl; // make a modifiable copy std::smatch m; - std::regex ex("(\\\\\\d+)"); // this searches for backslash-number(s); example "\1" - std::string rtn; // result without refs + std::regex ex(get_backref_pattern(repl)); + std::string rtn; size_type byte_offset = 0; - while (std::regex_search(str, m, ex)) { - if (m.size() == 0) break; - std::string const backref = m[0]; - size_type const position = static_cast(m.position(0)); - size_type const length = static_cast(backref.length()); + while (std::regex_search(str, m, ex) && !m.empty()) { + // parse the back-ref index number + size_type const index = static_cast(std::atoi(std::string{m[1]}.c_str())); + CUDF_EXPECTS(index > 0 && index < 100, "Group index numbers must be in the range 1-99"); + + // store the new byte offset and index value + size_type const position = static_cast(m.position(0)); byte_offset += position; - size_type const index = std::atoi(backref.c_str() + 1); // back-ref index number - CUDF_EXPECTS(index > 0, "Back-reference numbers must be greater than 0"); - rtn += str.substr(0, position); - str = str.substr(position + length); backrefs.push_back({index, byte_offset}); + + // update the output string + rtn += str.substr(0, position); + // remove the back-ref pattern to continue parsing + str = str.substr(position + static_cast(m.length(0))); } if (!str.empty()) // add the remainder rtn += str; // of the string @@ -96,7 +114,7 @@ std::unique_ptr replace_with_backrefs( auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings.size(), stream); auto const regex_insts = d_prog->insts_counts(); - // parse the repl string for backref indicators + // parse the repl string for back-ref indicators auto const parse_result = parse_backrefs(repl); rmm::device_uvector backrefs(parse_result.second.size(), stream); CUDA_TRY(cudaMemcpyAsync(backrefs.data(), diff --git a/cpp/src/structs/copying/concatenate.cu b/cpp/src/structs/copying/concatenate.cu index 6f18c4bcbd4..fe5483b119d 100644 --- a/cpp/src/structs/copying/concatenate.cu +++ b/cpp/src/structs/copying/concatenate.cu @@ -28,6 +28,7 @@ #include #include +#include namespace cudf { namespace structs { @@ -53,7 +54,11 @@ std::unique_ptr concatenate(host_span columns, return cudf::detail::concatenate(cols, stream, mr); }); - size_type const total_length = children[0]->size(); + // get total length from concatenated children; if no child exists, we would compute it + auto const acc_size_fn = [](size_type s, column_view const& c) { return s + c.size(); }; + auto const total_length = + !children.empty() ? children[0]->size() + : std::accumulate(columns.begin(), columns.end(), size_type{0}, acc_size_fn); // if any of the input columns have nulls, construct the output mask bool const has_nulls = diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 80bea2ab55e..bfeb6ef3533 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -16,8 +16,10 @@ #include +#include #include #include +#include #include #include #include @@ -61,6 +63,24 @@ std::vector> extract_ordered_struct_children( return result; } +namespace { +/** + * @brief Check whether the specified column is of type `STRUCT`. + */ +bool is_struct(cudf::column_view const& col) { return col.type().id() == type_id::STRUCT; } + +/** + * @brief Check whether the specified column is of type LIST, or any LISTs in its descendent + * columns. 
+ */ +bool is_or_has_nested_lists(cudf::column_view const& col) +{ + auto is_list = [](cudf::column_view const& col) { return col.type().id() == type_id::LIST; }; + + return is_list(col) || std::any_of(col.child_begin(), col.child_end(), is_or_has_nested_lists); +} +} // namespace + /** * @brief Flattens struct columns to constituent non-struct columns in the input table. * @@ -86,6 +106,13 @@ struct flattened_table { null_precedence(null_precedence), nullability(nullability) { + fail_if_unsupported_types(input); + } + + void fail_if_unsupported_types(table_view const& input) const + { + auto const has_lists = std::any_of(input.begin(), input.end(), is_or_has_nested_lists); + CUDF_EXPECTS(not has_lists, "Flattening LIST columns is not supported."); } // Convert null_mask to BOOL8 columns and flatten the struct children in order. @@ -156,9 +183,6 @@ struct flattened_table { } }; -/** - * @copydoc cudf::detail::flatten_nested_columns - */ std::tuple, std::vector, @@ -168,15 +192,107 @@ flatten_nested_columns(table_view const& input, std::vector const& null_precedence, column_nullability nullability) { - std::vector> validity_as_column; - auto const has_struct = std::any_of( - input.begin(), input.end(), [](auto const& col) { return col.type().id() == type_id::STRUCT; }); - if (not has_struct) - return std::make_tuple(input, column_order, null_precedence, std::move(validity_as_column)); + auto const has_struct = std::any_of(input.begin(), input.end(), is_struct); + if (not has_struct) { + return std::make_tuple( + input, column_order, null_precedence, std::vector>{}); + } return flattened_table{input, column_order, null_precedence, nullability}(); } +namespace { +using vector_of_columns = std::vector>; +using column_index_t = typename vector_of_columns::size_type; + +// Forward declaration, to enable recursion via `unflattener`. +std::unique_ptr unflatten_struct(vector_of_columns& flattened, + column_index_t& current_index, + cudf::column_view const& blueprint); + +/** + * @brief Helper functor to reconstruct STRUCT columns from its flattened member columns. + * + */ +class unflattener { + public: + unflattener(vector_of_columns& flattened_, column_index_t& current_index_) + : flattened{flattened_}, current_index{current_index_} + { + } + + auto operator()(column_view const& blueprint) + { + return is_struct(blueprint) ? unflatten_struct(flattened, current_index, blueprint) + : std::move(flattened[current_index++]); + } + + private: + vector_of_columns& flattened; + column_index_t& current_index; + +}; // class unflattener; + +std::unique_ptr unflatten_struct(vector_of_columns& flattened, + column_index_t& current_index, + cudf::column_view const& blueprint) +{ + // "Consume" columns from `flattened`, starting at `current_index`, + // based on the provided `blueprint` struct col. Recurse for struct children. + CUDF_EXPECTS(blueprint.type().id() == type_id::STRUCT, + "Expected blueprint column to be a STRUCT column."); + + CUDF_EXPECTS(current_index < flattened.size(), "STRUCT column can't have 0 children."); + + auto const num_rows = flattened[current_index]->size(); + + // cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector + // before the child/member columns. + // E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to: + // 1. Null Vector for STRUCT_1 + // 2. Null Vector for STRUCT_2 + // 3. Member STRUCT_2::A + // 4. Member STRUCT_2::B + // 5. Member STRUCT_1::C + // + // Extract null-vector *before* child columns are constructed. 
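The depth-first ordering described in the comment above is the invariant shared by flatten_nested_columns() and the unflattener. A minimal host-only sketch of that ordering convention follows; it is illustrative and not from this patch (the `node` and `flatten_names` names are hypothetical; the real code walks column_views and null masks rather than strings).

// --- begin illustrative sketch (not part of the patch) ---
#include <string>
#include <vector>

struct node {
  std::string name;
  bool is_struct;
  std::vector<node> children;
};

// A struct contributes its null vector first, then each child in order,
// recursing depth-first into nested structs.
void flatten_names(node const& n, std::vector<std::string>& out)
{
  if (n.is_struct) {
    out.push_back("null-vector(" + n.name + ")");
    for (auto const& child : n.children) { flatten_names(child, out); }
  } else {
    out.push_back(n.name);
  }
}

// For STRUCT_1< STRUCT_2< A, B >, C > the serialized order is:
//   null-vector(STRUCT_1), null-vector(STRUCT_2), A, B, C
// which is exactly the order the unflattener consumes: one null vector per
// struct entry, one column per non-struct entry.
// --- end illustrative sketch ---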
+ auto struct_null_column_contents = flattened[current_index++]->release(); + auto unflattening_iter = + thrust::make_transform_iterator(blueprint.child_begin(), unflattener{flattened, current_index}); + + return cudf::make_structs_column( + num_rows, + vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_children()}, + UNKNOWN_NULL_COUNT, // Do count? + std::move(*struct_null_column_contents.null_mask)); +} +} // namespace + +std::unique_ptr unflatten_nested_columns(std::unique_ptr&& flattened, + table_view const& blueprint) +{ + // Bail, if LISTs are present. + auto const has_lists = std::any_of(blueprint.begin(), blueprint.end(), is_or_has_nested_lists); + CUDF_EXPECTS(not has_lists, "Unflattening LIST columns is not supported."); + + // If there are no STRUCTs, unflattening is a NOOP. + auto const has_structs = std::any_of(blueprint.begin(), blueprint.end(), is_struct); + if (not has_structs) { + return std::move(flattened); // Unchanged. + } + + // There be struct columns. + // Note: Requires null vectors for all struct input columns. + auto flattened_columns = flattened->release(); + auto current_idx = column_index_t{0}; + + auto unflattening_iter = + thrust::make_transform_iterator(blueprint.begin(), unflattener{flattened_columns, current_idx}); + + return std::make_unique( + vector_of_columns{unflattening_iter, unflattening_iter + blueprint.num_columns()}); +} + // Helper function to superimpose validity of parent struct // over the specified member (child) column. void superimpose_parent_nulls(bitmask_type const* parent_null_mask, @@ -187,8 +303,7 @@ void superimpose_parent_nulls(bitmask_type const* parent_null_mask, { if (!child.nullable()) { // Child currently has no null mask. Copy parent's null mask. - child.set_null_mask(rmm::device_buffer{ - parent_null_mask, cudf::bitmask_allocation_size_bytes(child.size()), stream, mr}); + child.set_null_mask(cudf::detail::copy_bitmask(parent_null_mask, 0, child.size(), stream, mr)); child.set_null_count(parent_null_count); } else { // Child should have a null mask. diff --git a/cpp/src/structs/utilities.hpp b/cpp/src/structs/utilities.hpp index eee9ca63146..a68f09574ce 100644 --- a/cpp/src/structs/utilities.hpp +++ b/cpp/src/structs/utilities.hpp @@ -76,6 +76,35 @@ flatten_nested_columns(table_view const& input, std::vector const& null_precedence, column_nullability nullability = column_nullability::MATCH_INCOMING); +/** + * @brief Unflatten columns flattened as by `flatten_nested_columns()`, + * based on the provided `blueprint`. + * + * cudf::flatten_nested_columns() executes depth first, and serializes the struct null vector + * before the child/member columns. + * E.g. STRUCT_1< STRUCT_2< A, B >, C > is flattened to: + * 1. Null Vector for STRUCT_1 + * 2. Null Vector for STRUCT_2 + * 3. Member STRUCT_2::A + * 4. Member STRUCT_2::B + * 5. Member STRUCT_1::C + * + * `unflatten_nested_columns()` reconstructs nested columns from flattened input that follows + * the convention above. + * + * Note: This function requires a null-mask vector for each STRUCT column, including for nested + * STRUCT members. + * + * @param flattened "Flattened" `table` of input columns, following the conventions in + * `flatten_nested_columns()`. + * @param blueprint The exemplar `table_view` with nested columns intact, whose structure defines + * the nesting of the reconstructed output table. + * @return std::unique_ptr Unflattened table (with nested STRUCT columns) reconstructed + * based on `blueprint`. 
+ */ +std::unique_ptr unflatten_nested_columns(std::unique_ptr&& flattened, + table_view const& blueprint); + /** * @brief Pushdown nulls from a parent mask into a child column, using AND. * diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu index 3800339a6a2..b2230f95842 100644 --- a/cpp/src/text/subword/load_hash_file.cu +++ b/cpp/src/text/subword/load_hash_file.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -40,11 +41,11 @@ namespace { struct get_codepoint_metadata_init { rmm::cuda_stream_view stream; - codepoint_metadata_type* operator()() const + rmm::device_uvector* operator()() const { - codepoint_metadata_type* table = - static_cast(rmm::mr::get_current_device_resource()->allocate( - codepoint_metadata_size * sizeof(codepoint_metadata_type), stream)); + auto table_vector = + new rmm::device_uvector(codepoint_metadata_size, stream); + auto table = table_vector->data(); thrust::fill(rmm::exec_policy(stream), table + cp_section1_end, table + codepoint_metadata_size, @@ -60,18 +61,18 @@ struct get_codepoint_metadata_init { (cp_section2_end - cp_section2_begin + 1) * sizeof(codepoint_metadata[0]), // 2nd section cudaMemcpyHostToDevice, stream.value())); - return table; + return table_vector; }; }; struct get_aux_codepoint_data_init { rmm::cuda_stream_view stream; - aux_codepoint_data_type* operator()() const + rmm::device_uvector* operator()() const { - aux_codepoint_data_type* table = - static_cast(rmm::mr::get_current_device_resource()->allocate( - aux_codepoint_data_size * sizeof(aux_codepoint_data_type), stream)); + auto table_vector = + new rmm::device_uvector(aux_codepoint_data_size, stream); + auto table = table_vector->data(); thrust::fill(rmm::exec_policy(stream), table + aux_section1_end, table + aux_codepoint_data_size, @@ -99,7 +100,7 @@ struct get_aux_codepoint_data_init { (aux_section4_end - aux_section4_begin + 1) * sizeof(aux_codepoint_data[0]), // 4th section cudaMemcpyHostToDevice, stream.value())); - return table; + return table_vector; } }; } // namespace @@ -112,11 +113,11 @@ struct get_aux_codepoint_data_init { */ const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stream) { - static cudf::strings::detail::thread_safe_per_context_cache + static cudf::strings::detail::thread_safe_per_context_cache< + rmm::device_uvector> g_codepoint_metadata; - get_codepoint_metadata_init function = {stream}; - return g_codepoint_metadata.find_or_initialize(function); + return g_codepoint_metadata.find_or_initialize(get_codepoint_metadata_init{stream})->data(); } /** @@ -127,10 +128,11 @@ const codepoint_metadata_type* get_codepoint_metadata(rmm::cuda_stream_view stre */ const aux_codepoint_data_type* get_aux_codepoint_data(rmm::cuda_stream_view stream) { - static cudf::strings::detail::thread_safe_per_context_cache + static cudf::strings::detail::thread_safe_per_context_cache< + rmm::device_uvector> g_aux_codepoint_data; - get_aux_codepoint_data_init function = {stream}; - return g_aux_codepoint_data.find_or_initialize(function); + + return g_aux_codepoint_data.find_or_initialize(get_aux_codepoint_data_init{stream})->data(); } namespace { diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 8c14f89d4d0..6de1044b492 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -172,8 +172,10 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, thrust::make_counting_iterator(0), 
thrust::make_counting_iterator(strings_count + 1), offsets_per_tensor.begin(), - [device_offsets, do_truncate, max_sequence_length, stride] __device__(cudf::size_type idx) { - uint32_t num_tokens = device_offsets[idx + 1] - device_offsets[idx]; + [device_offsets, do_truncate, max_sequence_length, stride, strings_count] __device__( + cudf::size_type idx) { + uint32_t const num_tokens = + idx < strings_count ? device_offsets[idx + 1] - device_offsets[idx] : 0; if (do_truncate || num_tokens <= max_sequence_length) return uint32_t{1}; return 1 + ((num_tokens - max_sequence_length + stride - 1) / stride); }, diff --git a/cpp/src/ast/transform.cu b/cpp/src/transform/compute_column.cu similarity index 68% rename from cpp/src/ast/transform.cu rename to cpp/src/transform/compute_column.cu index d6426f92002..1466ee9ad27 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/transform/compute_column.cu @@ -14,17 +14,18 @@ * limitations under the License. */ -#include -#include -#include -#include +#include +#include +#include #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -33,7 +34,6 @@ #include namespace cudf { -namespace ast { namespace detail { /** @@ -47,13 +47,14 @@ namespace detail { * @tparam has_nulls whether or not the output column may contain nulls. * * @param table The table device view used for evaluation. - * @param plan Container of device data required to evaluate the desired expression. + * @param device_expression_data Container of device data required to evaluate the desired + * expression. * @param output_column The destination for the results of evaluating the expression. */ template __launch_bounds__(max_block_size) __global__ void compute_column_kernel(table_device_view const table, - device_ast_plan plan, + ast::detail::expression_device_view device_expression_data, mutable_column_device_view output_column) { // The (required) extern storage of the shared memory array leads to @@ -61,23 +62,24 @@ __launch_bounds__(max_block_size) __global__ // workaround is to declare an arbitrary (here char) array type then cast it // after the fact to the appropriate type. 
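The workaround referred to in this comment is a common CUDA idiom; here is a minimal standalone kernel showing it in isolation. This sketch is not from the patch, and the `Intermediate` alias merely stands in for the kernel's per-thread intermediate type.

// --- begin illustrative sketch (not part of the patch) ---
#include <cstddef>
#include <cstdint>

using Intermediate = std::uint64_t;  // stand-in for the real intermediate type

__global__ void shared_storage_example(std::size_t num_intermediates)
{
  // Only one unsized extern __shared__ array may be declared per kernel, and
  // it cannot vary with a template parameter, so it is declared as char...
  extern __shared__ char raw_storage[];
  // ...and reinterpreted to the element type the kernel actually needs.
  auto* storage = reinterpret_cast<Intermediate*>(raw_storage);

  // Each thread owns a contiguous slice of the dynamic shared buffer.
  Intermediate* thread_storage = storage + threadIdx.x * num_intermediates;
  (void)thread_storage;
}

// Launch side: the dynamic shared memory size (third launch parameter) must
// cover block_size * num_intermediates elements of Intermediate, mirroring the
// shmem_per_thread * num_threads_per_block computation in the kernel launch
// configuration below.
// --- end illustrative sketch ---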
extern __shared__ char raw_intermediate_storage[]; - IntermediateDataType* intermediate_storage = - reinterpret_cast*>(raw_intermediate_storage); + ast::detail::IntermediateDataType* intermediate_storage = + reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * plan.num_intermediates]; + auto thread_intermediate_storage = + &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; auto const start_idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); auto const stride = static_cast(blockDim.x * gridDim.x); - auto evaluator = - cudf::ast::detail::expression_evaluator(table, plan, thread_intermediate_storage); + auto evaluator = cudf::ast::detail::expression_evaluator( + table, device_expression_data, thread_intermediate_storage); for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) { - auto output_dest = mutable_column_expression_result(output_column); + auto output_dest = ast::detail::mutable_column_expression_result(output_column); evaluator.evaluate(output_dest, row_index); } } -std::unique_ptr compute_column(table_view const table, - expression const& expr, +std::unique_ptr compute_column(table_view const& table, + ast::expression const& expr, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -89,19 +91,19 @@ std::unique_ptr compute_column(table_view const table, auto const nullable = cudf::nullable(table); auto const has_nulls = nullable && cudf::has_nulls(table); - auto const plan = ast_plan{expr, table, has_nulls, stream, mr}; + auto const parser = ast::detail::expression_parser{expr, table, has_nulls, stream, mr}; auto const output_column_mask_state = nullable ? (has_nulls ? mask_state::UNINITIALIZED : mask_state::ALL_VALID) : mask_state::UNALLOCATED; auto output_column = cudf::make_fixed_width_column( - plan.output_type(), table.num_rows(), output_column_mask_state, stream, mr); + parser.output_type(), table.num_rows(), output_column_mask_state, stream, mr); auto mutable_output_device = cudf::mutable_column_device_view::create(output_column->mutable_view(), stream); // Configure kernel parameters - auto const& dev_plan = plan.dev_plan; + auto const& device_expression_data = parser.device_expression_data; int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int shmem_limit_per_block; @@ -109,22 +111,23 @@ std::unique_ptr compute_column(table_view const table, cudaDeviceGetAttribute(&shmem_limit_per_block, cudaDevAttrMaxSharedMemoryPerBlock, device_id)); auto constexpr MAX_BLOCK_SIZE = 128; auto const block_size = - dev_plan.shmem_per_thread != 0 - ? std::min(MAX_BLOCK_SIZE, shmem_limit_per_block / dev_plan.shmem_per_thread) + device_expression_data.shmem_per_thread != 0 + ? 
std::min(MAX_BLOCK_SIZE, shmem_limit_per_block / device_expression_data.shmem_per_thread) : MAX_BLOCK_SIZE; - auto const config = cudf::detail::grid_1d{table.num_rows(), block_size}; - auto const shmem_per_block = dev_plan.shmem_per_thread * config.num_threads_per_block; + auto const config = cudf::detail::grid_1d{table.num_rows(), block_size}; + auto const shmem_per_block = + device_expression_data.shmem_per_thread * config.num_threads_per_block; // Execute the kernel auto table_device = table_device_view::create(table, stream); if (has_nulls) { - cudf::ast::detail::compute_column_kernel + cudf::detail::compute_column_kernel <<>>( - *table_device, dev_plan, *mutable_output_device); + *table_device, device_expression_data, *mutable_output_device); } else { - cudf::ast::detail::compute_column_kernel + cudf::detail::compute_column_kernel <<>>( - *table_device, dev_plan, *mutable_output_device); + *table_device, device_expression_data, *mutable_output_device); } CHECK_CUDA(stream.value()); return output_column; @@ -132,14 +135,12 @@ std::unique_ptr compute_column(table_view const table, } // namespace detail -std::unique_ptr compute_column(table_view const table, - expression const& expr, +std::unique_ptr compute_column(table_view const& table, + ast::expression const& expr, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::compute_column(table, expr, rmm::cuda_stream_default, mr); } -} // namespace ast - } // namespace cudf diff --git a/cpp/src/transpose/transpose.cu b/cpp/src/transpose/transpose.cu index 67a06e60dd3..5bc2cb21ac7 100644 --- a/cpp/src/transpose/transpose.cu +++ b/cpp/src/transpose/transpose.cu @@ -16,9 +16,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -44,7 +44,7 @@ std::pair, table_view> transpose(table_view const& input input.begin(), input.end(), [dtype](auto const& col) { return dtype == col.type(); }), "Column type mismatch"); - auto output_column = cudf::interleave_columns(input, mr); + auto output_column = cudf::detail::interleave_columns(input, stream, mr); auto one_iter = thrust::make_counting_iterator(1); auto splits_iter = thrust::make_transform_iterator( one_iter, [width = input.num_columns()](size_type idx) { return idx * width; }); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index c82826b8c60..19421e3115d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -379,6 +379,7 @@ ConfigureTest(STRINGS_TEST # - structs test ---------------------------------------------------------------------------------- ConfigureTest(STRUCTS_TEST structs/structs_column_tests.cpp + structs/utilities_tests.cpp ) ################################################################################################### diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 738c58c32b8..de6c9d486ec 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -#include -#include +#include #include #include #include @@ -24,6 +23,7 @@ #include #include #include +#include #include #include @@ -47,6 +47,35 @@ constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_leve struct TransformTest : public cudf::test::BaseFixture { }; +TEST_F(TransformTest, ColumnReference) +{ + auto c_0 = column_wrapper{3, 20, 1, 50}; + auto c_1 = column_wrapper{10, 7, 20, 0}; + auto table = cudf::table_view{{c_0, c_1}}; + + auto col_ref_0 = cudf::ast::column_reference(0); + + auto const& expected = c_0; + auto result = cudf::compute_column(table, col_ref_0); + + cudf::test::expect_columns_equal(expected, result->view(), verbosity); +} + +TEST_F(TransformTest, Literal) +{ + auto c_0 = column_wrapper{3, 20, 1, 50}; + auto c_1 = column_wrapper{10, 7, 20, 0}; + auto table = cudf::table_view{{c_0, c_1}}; + + auto literal_value = cudf::numeric_scalar(42); + auto literal = cudf::ast::literal(literal_value); + + auto expected = column_wrapper{42, 42, 42, 42}; + auto result = cudf::compute_column(table, literal); + + cudf::test::expect_columns_equal(expected, result->view(), verbosity); +} + TEST_F(TransformTest, BasicAddition) { auto c_0 = column_wrapper{3, 20, 1, 50}; @@ -55,10 +84,10 @@ TEST_F(TransformTest, BasicAddition) auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); auto expected = column_wrapper{13, 27, 21, 50}; - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -70,11 +99,11 @@ TEST_F(TransformTest, BasicAdditionLarge) auto table = cudf::table_view{{col, col}}; auto col_ref = cudf::ast::column_reference(0); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref, col_ref); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref, col_ref); auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); auto expected = column_wrapper(b, b + 2000); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -87,10 +116,10 @@ TEST_F(TransformTest, LessComparator) auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); auto expected = column_wrapper{true, false, true, false}; - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -105,11 +134,11 @@ TEST_F(TransformTest, LessComparatorLarge) auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); auto c = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 500; }); auto expected = column_wrapper(c, c + 2000); 
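These test updates all follow the same mechanical rename, so a compact sketch of the post-rename usage may help when reading them. It is assembled only from calls that already appear in this patch (cudf::ast::column_reference, cudf::ast::operation, cudf::compute_column); the header paths are assumptions and may differ on this branch.

// --- begin illustrative sketch (not part of the patch) ---
#include <cudf/ast/expressions.hpp>      // assumed header for ast::operation
#include <cudf/column/column.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>            // assumed header for cudf::compute_column
#include <cudf_test/column_wrapper.hpp>

#include <memory>

std::unique_ptr<cudf::column> add_two_columns_example()
{
  auto c_0   = cudf::test::fixed_width_column_wrapper<int32_t>{3, 20, 1, 50};
  auto c_1   = cudf::test::fixed_width_column_wrapper<int32_t>{10, 7, 20, 0};
  auto table = cudf::table_view{{c_0, c_1}};

  // Expression nodes are now built with cudf::ast::operation (previously
  // cudf::ast::expression)...
  auto col_ref_0 = cudf::ast::column_reference(0);
  auto col_ref_1 = cudf::ast::column_reference(1);
  auto expr      = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1);

  // ...and evaluated with cudf::compute_column (previously
  // cudf::ast::compute_column); for this input the result is {13, 27, 21, 50}.
  return cudf::compute_column(table, expr);
}
// --- end illustrative sketch ---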
- auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -126,15 +155,15 @@ TEST_F(TransformTest, MultiLevelTreeArithmetic) auto col_ref_2 = cudf::ast::column_reference(2); auto expression_left_subtree = - cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); + cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); auto expression_right_subtree = - cudf::ast::expression(cudf::ast::ast_operator::SUB, col_ref_2, col_ref_0); + cudf::ast::operation(cudf::ast::ast_operator::SUB, col_ref_2, col_ref_0); - auto expression_tree = cudf::ast::expression( + auto expression_tree = cudf::ast::operation( cudf::ast::ast_operator::ADD, expression_left_subtree, expression_right_subtree); - auto result = cudf::ast::compute_column(table, expression_tree); + auto result = cudf::compute_column(table, expression_tree); auto expected = column_wrapper{7, 73, 22, -99}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -142,8 +171,6 @@ TEST_F(TransformTest, MultiLevelTreeArithmetic) TEST_F(TransformTest, MultiLevelTreeArithmeticLarge) { - using namespace cudf::ast; - auto a = thrust::make_counting_iterator(0); auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i + 1; }); auto c = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); @@ -152,15 +179,17 @@ TEST_F(TransformTest, MultiLevelTreeArithmeticLarge) auto c_2 = column_wrapper(c, c + 2000); auto table = cudf::table_view{{c_0, c_1, c_2}}; - auto col_ref_0 = column_reference(0); - auto col_ref_1 = column_reference(1); - auto col_ref_2 = column_reference(2); + auto col_ref_0 = cudf::ast::column_reference(0); + auto col_ref_1 = cudf::ast::column_reference(1); + auto col_ref_2 = cudf::ast::column_reference(2); - auto expr_left_subtree = expression(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1); - auto expr_right_subtree = expression(cudf::ast::ast_operator::ADD, col_ref_2, col_ref_0); - auto expr_tree = expression(ast_operator::SUB, expr_left_subtree, expr_right_subtree); + auto expr_left_subtree = cudf::ast::operation(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1); + auto expr_right_subtree = + cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_2, col_ref_0); + auto expr_tree = + cudf::ast::operation(cudf::ast::ast_operator::SUB, expr_left_subtree, expr_right_subtree); - auto result = cudf::ast::compute_column(table, expr_tree); + auto result = cudf::compute_column(table, expr_tree); auto calc = [](auto i) { return (i * (i + 1)) - (i + (i * 2)); }; auto d = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return calc(i); }); auto expected = column_wrapper(d, d + 2000); @@ -180,12 +209,12 @@ TEST_F(TransformTest, ImbalancedTreeArithmetic) auto col_ref_2 = cudf::ast::column_reference(2); auto expression_right_subtree = - cudf::ast::expression(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1); + cudf::ast::operation(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1); auto expression_tree = - cudf::ast::expression(cudf::ast::ast_operator::SUB, col_ref_2, expression_right_subtree); + cudf::ast::operation(cudf::ast::ast_operator::SUB, col_ref_2, expression_right_subtree); - auto result = cudf::ast::compute_column(table, expression_tree); + auto result = cudf::compute_column(table, expression_tree); auto expected = column_wrapper{0.6, std::numeric_limits::infinity(), -3.201, 
-2099.18}; @@ -204,15 +233,15 @@ TEST_F(TransformTest, MultiLevelTreeComparator) auto col_ref_2 = cudf::ast::column_reference(2); auto expression_left_subtree = - cudf::ast::expression(cudf::ast::ast_operator::GREATER_EQUAL, col_ref_0, col_ref_1); + cudf::ast::operation(cudf::ast::ast_operator::GREATER_EQUAL, col_ref_0, col_ref_1); auto expression_right_subtree = - cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_2, col_ref_0); + cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_2, col_ref_0); - auto expression_tree = cudf::ast::expression( + auto expression_tree = cudf::ast::operation( cudf::ast::ast_operator::LOGICAL_AND, expression_left_subtree, expression_right_subtree); - auto result = cudf::ast::compute_column(table, expression_tree); + auto result = cudf::compute_column(table, expression_tree); auto expected = column_wrapper{false, true, false, false}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -228,13 +257,13 @@ TEST_F(TransformTest, MultiTypeOperationFailure) auto col_ref_1 = cudf::ast::column_reference(1); auto expression_0_plus_1 = - cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); + cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); auto expression_1_plus_0 = - cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_1, col_ref_0); + cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_1, col_ref_0); // Operations on different types are not allowed - EXPECT_THROW(cudf::ast::compute_column(table, expression_0_plus_1), cudf::logic_error); - EXPECT_THROW(cudf::ast::compute_column(table, expression_1_plus_0), cudf::logic_error); + EXPECT_THROW(cudf::compute_column(table, expression_0_plus_1), cudf::logic_error); + EXPECT_THROW(cudf::compute_column(table, expression_1_plus_0), cudf::logic_error); } TEST_F(TransformTest, LiteralComparison) @@ -246,9 +275,9 @@ TEST_F(TransformTest, LiteralComparison) auto literal_value = cudf::numeric_scalar(41); auto literal = cudf::ast::literal(literal_value); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, literal); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{false, false, false, true}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -261,9 +290,9 @@ TEST_F(TransformTest, UnaryNot) auto col_ref_0 = cudf::ast::column_reference(0); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::NOT, col_ref_0); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::NOT, col_ref_0); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{false, true, false, false}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -277,26 +306,26 @@ TEST_F(TransformTest, UnaryTrigonometry) auto col_ref_0 = cudf::ast::column_reference(0); auto expected_sin = column_wrapper{0.0, std::sqrt(2) / 2, std::sqrt(3.0) / 2.0}; - auto expression_sin = cudf::ast::expression(cudf::ast::ast_operator::SIN, col_ref_0); - auto result_sin = cudf::ast::compute_column(table, expression_sin); + auto expression_sin = cudf::ast::operation(cudf::ast::ast_operator::SIN, col_ref_0); + auto result_sin = cudf::compute_column(table, expression_sin); 
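The literal-based tests in this file (LiteralComparison above, CopyLiteral and TrueDiv below) share one construction pattern; the brief sketch below restates it using only calls that appear in this patch. Header paths are assumptions, and the scalar is kept in scope alongside the literal node, just as the tests themselves do.

// --- begin illustrative sketch (not part of the patch) ---
#include <cudf/ast/expressions.hpp>      // assumed header locations
#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/transform.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <memory>

std::unique_ptr<cudf::column> greater_than_literal_example()
{
  auto c_0   = cudf::test::fixed_width_column_wrapper<int32_t>{3, 0, 1, 50};
  auto table = cudf::table_view{{c_0}};

  // The scalar backs the literal node, so it stays alive for the evaluation.
  auto literal_value = cudf::numeric_scalar<int32_t>(41);
  auto literal       = cudf::ast::literal(literal_value);
  auto col_ref_0     = cudf::ast::column_reference(0);
  auto expr = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, literal);

  // BOOL8 result: {false, false, false, true} for this input.
  return cudf::compute_column(table, expr);
}
// --- end illustrative sketch ---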
cudf::test::expect_columns_equivalent(expected_sin, result_sin->view(), verbosity); auto expected_cos = column_wrapper{1.0, std::sqrt(2) / 2, 0.5}; - auto expression_cos = cudf::ast::expression(cudf::ast::ast_operator::COS, col_ref_0); - auto result_cos = cudf::ast::compute_column(table, expression_cos); + auto expression_cos = cudf::ast::operation(cudf::ast::ast_operator::COS, col_ref_0); + auto result_cos = cudf::compute_column(table, expression_cos); cudf::test::expect_columns_equivalent(expected_cos, result_cos->view(), verbosity); auto expected_tan = column_wrapper{0.0, 1.0, std::sqrt(3.0)}; - auto expression_tan = cudf::ast::expression(cudf::ast::ast_operator::TAN, col_ref_0); - auto result_tan = cudf::ast::compute_column(table, expression_tan); + auto expression_tan = cudf::ast::operation(cudf::ast::ast_operator::TAN, col_ref_0); + auto result_tan = cudf::compute_column(table, expression_tan); cudf::test::expect_columns_equivalent(expected_tan, result_tan->view(), verbosity); } TEST_F(TransformTest, ArityCheckFailure) { auto col_ref_0 = cudf::ast::column_reference(0); - EXPECT_THROW(cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0), cudf::logic_error); - EXPECT_THROW(cudf::ast::expression(cudf::ast::ast_operator::ABS, col_ref_0, col_ref_0), + EXPECT_THROW(cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0), cudf::logic_error); + EXPECT_THROW(cudf::ast::operation(cudf::ast::ast_operator::ABS, col_ref_0, col_ref_0), cudf::logic_error); } @@ -308,10 +337,10 @@ TEST_F(TransformTest, StringComparison) auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); auto expected = column_wrapper{true, false, true, false}; - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -322,9 +351,9 @@ TEST_F(TransformTest, CopyColumn) auto table = cudf::table_view{{c_0}}; auto col_ref_0 = cudf::ast::column_reference(0); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::IDENTITY, col_ref_0); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::IDENTITY, col_ref_0); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{3, 0, 1, 50}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -338,9 +367,9 @@ TEST_F(TransformTest, CopyLiteral) auto literal_value = cudf::numeric_scalar(-123); auto literal = cudf::ast::literal(literal_value); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::IDENTITY, literal); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::IDENTITY, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{-123, -123, -123, -123}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -355,9 +384,9 @@ TEST_F(TransformTest, TrueDiv) auto literal_value = cudf::numeric_scalar(2); auto literal = cudf::ast::literal(literal_value); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::TRUE_DIV, col_ref_0, literal); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::TRUE_DIV, col_ref_0, 
literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{1.5, 0.0, 0.5, 25.0}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -372,9 +401,9 @@ TEST_F(TransformTest, FloorDiv) auto literal_value = cudf::numeric_scalar(2.0); auto literal = cudf::ast::literal(literal_value); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::FLOOR_DIV, col_ref_0, literal); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::FLOOR_DIV, col_ref_0, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{1.0, 0.0, 0.0, 25.0}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -389,9 +418,9 @@ TEST_F(TransformTest, Mod) auto literal_value = cudf::numeric_scalar(2.0); auto literal = cudf::ast::literal(literal_value); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::MOD, col_ref_0, literal); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::MOD, col_ref_0, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{1.0, 0.0, -1.0, 0.0}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -406,9 +435,9 @@ TEST_F(TransformTest, PyMod) auto literal_value = cudf::numeric_scalar(2.0); auto literal = cudf::ast::literal(literal_value); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::PYMOD, col_ref_0, literal); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::PYMOD, col_ref_0, literal); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{1.0, 0.0, 1.0, 0.0}; cudf::test::expect_columns_equal(expected, result->view(), verbosity); @@ -422,10 +451,10 @@ TEST_F(TransformTest, BasicAdditionNulls) auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); auto expected = column_wrapper{{0, 0, 0, 50}, {0, 0, 0, 1}}; - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } @@ -447,11 +476,11 @@ TEST_F(TransformTest, BasicAdditionLargeNulls) auto table = cudf::table_view{{col}}; auto col_ref = cudf::ast::column_reference(0); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref, col_ref); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref, col_ref); auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); auto expected = column_wrapper(b, b + N, validities.begin()); - auto result = cudf::ast::compute_column(table, expression); + auto result = cudf::compute_column(table, expression); cudf::test::expect_columns_equal(expected, result->view(), verbosity); } diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu index 7d3b7beb2cb..c48f7ad4dbc 100644 --- a/cpp/tests/copying/concatenate_tests.cu +++ b/cpp/tests/copying/concatenate_tests.cu @@ -48,8 +48,6 @@ using Table = cudf::table; template 
struct TypedColumnTest : public cudf::test::BaseFixture { - static std::size_t data_size() { return 1000; } - static std::size_t mask_size() { return 100; } cudf::data_type type() { return cudf::data_type{cudf::type_to_id()}; } TypedColumnTest(rmm::cuda_stream_view stream = rmm::cuda_stream_default) @@ -58,14 +56,14 @@ struct TypedColumnTest : public cudf::test::BaseFixture { { auto typed_data = static_cast(data.data()); auto typed_mask = static_cast(mask.data()); - std::vector h_data(data_size()); + std::vector h_data(data.size()); std::iota(h_data.begin(), h_data.end(), char{0}); - std::vector h_mask(mask_size()); + std::vector h_mask(mask.size()); std::iota(h_mask.begin(), h_mask.end(), char{0}); CUDA_TRY(cudaMemcpyAsync( - typed_data, h_data.data(), data_size(), cudaMemcpyHostToDevice, stream.value())); + typed_data, h_data.data(), data.size(), cudaMemcpyHostToDevice, stream.value())); CUDA_TRY(cudaMemcpyAsync( - typed_mask, h_mask.data(), mask_size(), cudaMemcpyHostToDevice, stream.value())); + typed_mask, h_mask.data(), mask.size(), cudaMemcpyHostToDevice, stream.value())); stream.synchronize(); } @@ -484,7 +482,7 @@ TEST_F(OverflowTest, Presliced) auto offset_gen = cudf::detail::make_counting_transform_iterator( 0, [string_size](size_type index) { return index * string_size; }); cudf::test::fixed_width_column_wrapper offsets(offset_gen, offset_gen + num_rows + 1); - auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, num_rows); + auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, total_chars_size); auto col = cudf::make_strings_column( num_rows, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{}); @@ -515,7 +513,7 @@ TEST_F(OverflowTest, Presliced) offsets->view().begin(), offsets->view().end(), offsets->mutable_view().begin()); - auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, num_rows); + auto many_chars = cudf::make_fixed_width_column(data_type{type_id::INT8}, total_chars_size); auto col = cudf::make_strings_column( num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{}); @@ -826,6 +824,22 @@ TEST_F(StructsColumnTest, ConcatenateStructs) cudf::test::expect_columns_equivalent(*result, *expected); } +TEST_F(StructsColumnTest, ConcatenateEmptyStructs) +{ + using namespace cudf::test; + + auto expected = cudf::make_structs_column(10, {}, 0, rmm::device_buffer()); + auto first = cudf::make_structs_column(5, {}, 0, rmm::device_buffer()); + auto second = cudf::make_structs_column(2, {}, 0, rmm::device_buffer()); + auto third = cudf::make_structs_column(0, {}, 0, rmm::device_buffer()); + auto fourth = cudf::make_structs_column(3, {}, 0, rmm::device_buffer()); + + // concatenate + auto result = cudf::concatenate(std::vector({*first, *second, *third, *fourth})); + CUDF_EXPECTS(result->size() == expected->size(), "column size changed after concat"); + cudf::test::expect_columns_equivalent(*result, *expected); +} + TEST_F(StructsColumnTest, ConcatenateSplitStructs) { using namespace cudf::test; diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index c05e95c164e..39ad5f556d4 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -570,6 +570,41 @@ TEST_F(BasicDatetimeOpsTest, TestIsLeapYear) {true, false, true, true, true, true, true, true, false, true, true, false}}); } +TEST_F(BasicDatetimeOpsTest, TestDaysInMonths) + +{ + using namespace cudf::test; + using namespace cudf::datetime; + using 
namespace cuda::std::chrono; + + auto timestamps_s = + cudf::test::fixed_width_column_wrapper{ + { + 0L, // NULL + -1887541682L, // 1910-03-10 10:51:58 + 0L, // NULL + -1251006943L, // 1930-05-11 18:04:17 + -932134638L, // 1940-06-18 09:42:42 + -614354877L, // 1950-07-14 09:52:03 + -296070394L, // 1960-08-14 06:13:26 + 22840404L, // 1970-09-22 08:33:24 + 339817190L, // 1980-10-08 01:39:50 + 657928062L, // 1990-11-06 21:47:42 + 976630837L, // 2000-12-12 14:20:37 + 1294699018L, // 2011-01-10 22:36:58 + 1613970182L, // 2021-02-22 05:03:02 - non leap year February + 1930963331L, // 2031-03-11 02:42:11 + 2249867102L, // 2041-04-18 03:05:02 + 951426858L, // 2000-02-24 21:14:18 - leap year February + }, + iterators::nulls_at({0, 2})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*days_in_month(timestamps_s), + cudf::test::fixed_width_column_wrapper{ + {-1, 31, -1, 31, 30, 31, 31, 30, 31, 30, 31, 31, 28, 31, 30, 29}, + iterators::nulls_at({0, 2})}); +} + TEST_F(BasicDatetimeOpsTest, TestQuarter) { using namespace cudf::test; diff --git a/cpp/tests/encode/encode_tests.cpp b/cpp/tests/encode/encode_tests.cpp index 52244b38dfe..73c77a39a97 100644 --- a/cpp/tests/encode/encode_tests.cpp +++ b/cpp/tests/encode/encode_tests.cpp @@ -67,9 +67,6 @@ TYPED_TEST(EncodeNumericTests, SimpleWithNulls) cudf::test::fixed_width_column_wrapper expect_keys{{1, 2, 3, 0}, {1, 1, 1, 0}}; auto const result = cudf::encode(cudf::table_view({input})); - cudf::test::print(result.first->view().column(0)); - cudf::test::print(expect_keys); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.first->view().column(0), expect_keys); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.second->view(), expect); } diff --git a/cpp/tests/groupby/argmax_tests.cpp b/cpp/tests/groupby/argmax_tests.cpp index 6bf627d7b78..7cf693f7b08 100644 --- a/cpp/tests/groupby/argmax_tests.cpp +++ b/cpp/tests/groupby/argmax_tests.cpp @@ -47,10 +47,10 @@ TYPED_TEST(groupby_argmax_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals{0, 1, 2}; - auto agg = cudf::make_argmax_aggregation(); + auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmax_aggregation(); + auto agg2 = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -67,10 +67,10 @@ TYPED_TEST(groupby_argmax_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_argmax_aggregation(); + auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmax_aggregation(); + auto agg2 = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -87,10 +87,10 @@ TYPED_TEST(groupby_argmax_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_argmax_aggregation(); + auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmax_aggregation(); + auto agg2 = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -111,10 +111,10 @@ TYPED_TEST(groupby_argmax_test, null_keys_and_values) // {6, 3, 5, 4, 0, 2, 1, -} fixed_width_column_wrapper 
expect_vals({3, 4, 7, 0}, {1, 1, 1, 0}); - auto agg = cudf::make_argmax_aggregation(); + auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmax_aggregation(); + auto agg2 = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -132,10 +132,10 @@ TEST_F(groupby_argmax_string_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals({0, 4, 2}); - auto agg = cudf::make_argmax_aggregation(); + auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmax_aggregation(); + auto agg2 = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -150,10 +150,10 @@ TEST_F(groupby_argmax_string_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_argmax_aggregation(); + auto agg = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmax_aggregation(); + auto agg2 = cudf::make_argmax_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -172,12 +172,13 @@ TEST_F(groupby_dictionary_argmax_test, basic) fixed_width_column_wrapper expect_vals({ 0, 4, 2 }); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_argmax_aggregation()); + test_single_agg( + keys, vals, expect_keys, expect_vals, cudf::make_argmax_aggregation()); test_single_agg(keys, vals, expect_keys, expect_vals, - cudf::make_argmax_aggregation(), + cudf::make_argmax_aggregation(), force_use_sort_impl::YES); } diff --git a/cpp/tests/groupby/argmin_tests.cpp b/cpp/tests/groupby/argmin_tests.cpp index d192c1b21b1..915575546c9 100644 --- a/cpp/tests/groupby/argmin_tests.cpp +++ b/cpp/tests/groupby/argmin_tests.cpp @@ -47,10 +47,10 @@ TYPED_TEST(groupby_argmin_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals{6, 9, 8}; - auto agg = cudf::make_argmin_aggregation(); + auto agg = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmin_aggregation(); + auto agg2 = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -67,10 +67,10 @@ TYPED_TEST(groupby_argmin_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_argmin_aggregation(); + auto agg = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmin_aggregation(); + auto agg2 = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -87,10 +87,10 @@ TYPED_TEST(groupby_argmin_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_argmin_aggregation(); + auto agg = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmin_aggregation(); + auto agg2 = 
cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -111,11 +111,11 @@ TYPED_TEST(groupby_argmin_test, null_keys_and_values) // { 9, 6, 8, 5, 0, 7, 1, -} fixed_width_column_wrapper expect_vals({3, 9, 8, 0}, {1, 1, 1, 0}); - auto agg = cudf::make_argmin_aggregation(); + auto agg = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); // TODO: explore making this a gtest parameter - auto agg2 = cudf::make_argmin_aggregation(); + auto agg2 = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -133,10 +133,10 @@ TEST_F(groupby_argmin_string_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals({3, 5, 7}); - auto agg = cudf::make_argmin_aggregation(); + auto agg = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmin_aggregation(); + auto agg2 = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -151,10 +151,10 @@ TEST_F(groupby_argmin_string_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_argmin_aggregation(); + auto agg = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_argmin_aggregation(); + auto agg2 = cudf::make_argmin_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -173,12 +173,13 @@ TEST_F(groupby_dictionary_argmin_test, basic) fixed_width_column_wrapper expect_vals({ 3, 5, 7 }); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_argmin_aggregation()); + test_single_agg( + keys, vals, expect_keys, expect_vals, cudf::make_argmin_aggregation()); test_single_agg(keys, vals, expect_keys, expect_vals, - cudf::make_argmin_aggregation(), + cudf::make_argmin_aggregation(), force_use_sort_impl::YES); } diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index 43c62743b9f..009917dabae 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -45,7 +45,7 @@ TYPED_TEST(groupby_collect_list_test, CollectWithoutNulls) fixed_width_column_wrapper expect_keys{1, 2}; lists_column_wrapper expect_vals{{1, 2, 3}, {4, 5, 6}}; - auto agg = cudf::make_collect_list_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } @@ -64,7 +64,7 @@ TYPED_TEST(groupby_collect_list_test, CollectWithNulls) lists_column_wrapper expect_vals{ {{1, 2}, validity.begin()}, {{3, 4}, validity.begin()}, {{5, 6}, validity.begin()}}; - auto agg = cudf::make_collect_list_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } @@ -82,7 +82,7 @@ TYPED_TEST(groupby_collect_list_test, CollectWithNullExclusion) lists_column_wrapper expect_vals{{2}, {4}, {}, {8, 9}}; - auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); + auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); 
} @@ -97,7 +97,7 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInput) fixed_width_column_wrapper expect_keys{}; lists_column_wrapper expect_vals{}; - auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); + auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } @@ -116,7 +116,7 @@ TYPED_TEST(groupby_collect_list_test, CollectLists) lists_column_wrapper expect_vals{ {{1, 2}, {3, 4}}, {{5, 6, 7}, LCW{}}, {{9, 10}, {11}}}; - auto agg = cudf::make_collect_list_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } @@ -135,7 +135,7 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion) LCW expect_vals{{{1, 2}}, {LCW{}}, {{9, 10}, {11}}, {}}; - auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); + auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } @@ -158,7 +158,7 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputLists) auto expect_values = cudf::make_lists_column(0, make_empty_column(offsets), std::move(expect_child), 0, {}); - auto agg = cudf::make_collect_list_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg)); } @@ -190,7 +190,7 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs) auto expect_values = cudf::make_lists_column( 0, make_empty_column(data_type{type_to_id()}), std::move(expect_child), 0, {}); - auto agg = cudf::make_collect_list_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg)); } @@ -212,8 +212,11 @@ TYPED_TEST(groupby_collect_list_test, dictionary) 0, rmm::device_buffer{}); - test_single_agg( - keys, vals, expect_keys, expect_vals->view(), cudf::make_collect_list_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals->view(), + cudf::make_collect_list_aggregation()); } } // namespace test diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp index 2f89b04c745..198caabfca9 100644 --- a/cpp/tests/groupby/collect_set_tests.cpp +++ b/cpp/tests/groupby/collect_set_tests.cpp @@ -33,16 +33,20 @@ namespace test { #define VALIDITY std::initializer_list struct CollectSetTest : public cudf::test::BaseFixture { - static auto collect_set() { return cudf::make_collect_set_aggregation(); } + static auto collect_set() + { + return cudf::make_collect_set_aggregation(); + } static auto collect_set_null_unequal() { - return cudf::make_collect_set_aggregation(null_policy::INCLUDE, null_equality::UNEQUAL); + return cudf::make_collect_set_aggregation(null_policy::INCLUDE, + null_equality::UNEQUAL); } static auto collect_set_null_exclude() { - return cudf::make_collect_set_aggregation(null_policy::EXCLUDE); + return cudf::make_collect_set_aggregation(null_policy::EXCLUDE); } }; @@ -174,7 +178,7 @@ TEST_F(CollectSetTest, FloatsWithNaN) vals, keys_expected, vals_expected, - cudf::make_collect_set_aggregation( + cudf::make_collect_set_aggregation( null_policy::INCLUDE, null_equality::EQUAL, nan_equality::ALL_EQUAL)); // null unequal with nan equal vals_expected = { @@ -183,7 +187,7 @@ TEST_F(CollectSetTest, FloatsWithNaN) vals, keys_expected, vals_expected, - 
cudf::make_collect_set_aggregation( + cudf::make_collect_set_aggregation( null_policy::INCLUDE, null_equality::UNEQUAL, nan_equality::ALL_EQUAL)); } diff --git a/cpp/tests/groupby/count_scan_tests.cpp b/cpp/tests/groupby/count_scan_tests.cpp index 9740bfa1954..62e8b11241d 100644 --- a/cpp/tests/groupby/count_scan_tests.cpp +++ b/cpp/tests/groupby/count_scan_tests.cpp @@ -53,11 +53,11 @@ TYPED_TEST(groupby_count_scan_test, basic) result_wrapper expect_vals{0, 1, 2, 0, 1, 2, 3, 0, 1, 2}; // clang-format on - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)), "Unsupported groupby scan aggregation"); - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -74,10 +74,10 @@ TYPED_TEST(groupby_count_scan_test, empty_cols) result_wrapper expect_vals; // clang-format on - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); EXPECT_NO_THROW(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1))); - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -94,7 +94,7 @@ TYPED_TEST(groupby_count_scan_test, zero_valid_keys) result_wrapper expect_vals{}; // clang-format on - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -111,7 +111,7 @@ TYPED_TEST(groupby_count_scan_test, zero_valid_values) result_wrapper expect_vals{0, 1, 2}; // clang-format on - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -130,7 +130,7 @@ TYPED_TEST(groupby_count_scan_test, null_keys_and_values) result_wrapper expect_vals{0, 1, 2, 0, 1, 2, 3, 0, 1, 0}; // clang-format on - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -151,7 +151,7 @@ TEST_F(groupby_count_scan_string_test, basic) result_wrapper expect_vals{0, 0, 0, 1, 0, 1}; // clang-format on - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -182,10 +182,14 @@ TYPED_TEST(FixedPointTestBothReps, GroupByCountScan) // clang-format on CUDF_EXPECT_THROW_MESSAGE( - test_single_scan(keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation()), + test_single_scan(keys, + vals, + expect_keys, + expect_vals, + cudf::make_count_aggregation()), "Unsupported groupby scan aggregation"); - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -205,11 +209,14 @@ TEST_F(groupby_dictionary_count_scan_test, basic) result_wrapper expect_vals{0, 0, 0, 1, 0, 1}; // clang-format on - auto agg1 = 
cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg1)), "Unsupported groupby scan aggregation"); - test_single_scan( - keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation(null_policy::INCLUDE)); + test_single_scan(keys, + vals, + expect_keys, + expect_vals, + cudf::make_count_aggregation(null_policy::INCLUDE)); } } // namespace test diff --git a/cpp/tests/groupby/count_tests.cpp b/cpp/tests/groupby/count_tests.cpp index 2d45de04607..cbb821767c9 100644 --- a/cpp/tests/groupby/count_tests.cpp +++ b/cpp/tests/groupby/count_tests.cpp @@ -45,13 +45,13 @@ TYPED_TEST(groupby_count_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals{3, 4, 3}; - auto agg = cudf::make_count_aggregation(); + auto agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -66,10 +66,10 @@ TYPED_TEST(groupby_count_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals; - auto agg = cudf::make_count_aggregation(); + auto agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); } @@ -84,13 +84,13 @@ TYPED_TEST(groupby_count_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_count_aggregation(); + auto agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -105,14 +105,14 @@ TYPED_TEST(groupby_count_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals{0}; - auto agg = cudf::make_count_aggregation(); + auto agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); fixed_width_column_wrapper expect_vals2{3}; - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2)); } @@ -133,14 +133,14 @@ TYPED_TEST(groupby_count_test, null_keys_and_values) fixed_width_column_wrapper expect_vals({2, 3, 2, 0}); // clang-format on - auto agg = cudf::make_count_aggregation(); + auto 
agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); fixed_width_column_wrapper expect_vals2{3, 4, 2, 1}; - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2)); } @@ -160,10 +160,10 @@ TEST_F(groupby_count_string_test, basic) fixed_width_column_wrapper expect_keys{0, 1, 3, 5}; fixed_width_column_wrapper expect_vals{1, 1, 2, 2}; - auto agg = cudf::make_count_aggregation(); + auto agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); } // clang-format on @@ -191,13 +191,13 @@ TYPED_TEST(FixedPointTestBothReps, GroupByCount) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals = fixed_width_column_wrapper{3, 4, 3}; - auto agg = cudf::make_count_aggregation(); + auto agg = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg1 = cudf::make_count_aggregation(); + auto agg1 = cudf::make_count_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg1), force_use_sort_impl::YES); - auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); + auto agg2 = cudf::make_count_aggregation(null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2)); } @@ -216,9 +216,14 @@ TEST_F(groupby_dictionary_count_test, basic) fixed_width_column_wrapper expect_vals{1, 1, 2, 2}; // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation()); test_single_agg( - keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation(), force_use_sort_impl::YES); + keys, vals, expect_keys, expect_vals, cudf::make_count_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_count_aggregation(), + force_use_sort_impl::YES); } } // namespace test diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp index 9a083ac8e74..542205b5b51 100644 --- a/cpp/tests/groupby/groupby_test_util.hpp +++ b/cpp/tests/groupby/groupby_test_util.hpp @@ -63,7 +63,7 @@ inline void test_single_agg(column_view const& keys, column_view const& values, column_view const& expect_keys, column_view const& expect_vals, - std::unique_ptr&& agg, + std::unique_ptr&& agg, force_use_sort_impl use_sort = force_use_sort_impl::NO, null_policy include_null_keys = null_policy::EXCLUDE, sorted keys_are_sorted = sorted::NO, @@ -78,7 +78,7 @@ inline void test_single_agg(column_view const& keys, if (use_sort == force_use_sort_impl::YES) { // WAR to force groupby to use sort implementation - requests[0].aggregations.push_back(make_nth_element_aggregation(0)); + requests[0].aggregations.push_back(make_nth_element_aggregation(0)); } groupby::groupby gb_obj( @@ -105,14 +105,14 @@ inline void test_single_scan(column_view const& keys, column_view const& values, column_view const& expect_keys, column_view const& expect_vals, - std::unique_ptr&& agg, + std::unique_ptr&& agg, 
                             null_policy include_null_keys = null_policy::EXCLUDE,
                             sorted keys_are_sorted        = sorted::NO,
                             std::vector const& column_order    = {},
                             std::vector const& null_precedence = {})
 {
-  std::vector requests;
-  requests.emplace_back(groupby::aggregation_request());
+  std::vector requests;
+  requests.emplace_back(groupby::scan_request());
   requests[0].values = values;
   requests[0].aggregations.push_back(std::move(agg));
diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp
index 91db37a5ff6..683eeb7eb01 100644
--- a/cpp/tests/groupby/keys_tests.cpp
+++ b/cpp/tests/groupby/keys_tests.cpp
@@ -50,7 +50,7 @@ TYPED_TEST(groupby_keys_test, basic)
     fixed_width_column_wrapper expect_vals { 3, 4, 3 };
   // clang-format on

-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }

@@ -68,7 +68,7 @@ TYPED_TEST(groupby_keys_test, zero_valid_keys)
     fixed_width_column_wrapper expect_vals { };
   // clang-format on

-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }

@@ -89,7 +89,7 @@ TYPED_TEST(groupby_keys_test, some_null_keys)
     fixed_width_column_wrapper expect_vals { 3, 4, 2, 1};
   // clang-format on

-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation();
   test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg));
 }

@@ -111,7 +111,7 @@ TYPED_TEST(groupby_keys_test, include_null_keys)
     fixed_width_column_wrapper expect_vals { 9, 19, 10, 4, 7};
   // clang-format on

-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -135,7 +135,7 @@ TYPED_TEST(groupby_keys_test, pre_sorted_keys)
     fixed_width_column_wrapper expect_vals { 3, 18, 24, 4};
   // clang-format on

-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -160,7 +160,7 @@ TYPED_TEST(groupby_keys_test, pre_sorted_keys_descending)
     fixed_width_column_wrapper expect_vals { 0, 6, 22, 21 };
   // clang-format on

-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -187,7 +187,7 @@ TYPED_TEST(groupby_keys_test, pre_sorted_keys_nullable)
     fixed_width_column_wrapper expect_vals { 3, 15, 17, 4};
   // clang-format on

-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -215,7 +215,7 @@ TYPED_TEST(groupby_keys_test, pre_sorted_keys_nulls_before_include_nulls)
     fixed_width_column_wrapper expect_vals { 3, 7, 11, 7, 17, 4};
   // clang-format on

-  auto agg = cudf::make_sum_aggregation();
+  auto agg = cudf::make_sum_aggregation();
   test_single_agg(keys,
                   vals,
                   expect_keys,
@@ -234,10 +234,11 @@ TYPED_TEST(groupby_keys_test, mismatch_num_rows)
   fixed_width_column_wrapper keys{1, 2, 3};
   fixed_width_column_wrapper vals{0, 1, 2, 3, 4};

-  auto agg = cudf::make_count_aggregation();
+  auto agg = cudf::make_count_aggregation();
   CUDF_EXPECT_THROW_MESSAGE(test_single_agg(keys, vals, keys, vals, std::move(agg)),
                             "Size mismatch between request values and groupby keys.");
-  CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, keys, vals, std::move(agg)),
+  auto agg2 = cudf::make_count_aggregation();
+  CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, keys, vals, std::move(agg2)),
                             "Size mismatch between request values and groupby
keys."); } @@ -257,7 +258,7 @@ TEST_F(groupby_string_keys_test, basic) fixed_width_column_wrapper expect_vals { 9, 19, 17 }; // clang-format on - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } // clang-format on @@ -278,9 +279,14 @@ TEST_F(groupby_dictionary_keys_test, basic) fixed_width_column_wrapper expect_vals({ 9, 19, 17 }); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation()); test_single_agg( - keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation(), force_use_sort_impl::YES); + keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_sum_aggregation(), + force_use_sort_impl::YES); } } // namespace test diff --git a/cpp/tests/groupby/m2_tests.cpp b/cpp/tests/groupby/m2_tests.cpp index 7b338a0d9b8..be7d6c1ce05 100644 --- a/cpp/tests/groupby/m2_tests.cpp +++ b/cpp/tests/groupby/m2_tests.cpp @@ -44,7 +44,7 @@ auto compute_M2(cudf::column_view const& keys, cudf::column_view const& values) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = values; - requests[0].aggregations.emplace_back(cudf::make_m2_aggregation()); + requests[0].aggregations.emplace_back(cudf::make_m2_aggregation()); auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); auto result = gb_obj.aggregate(requests); diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp index 70a48da69e8..4d83dc9f7ba 100644 --- a/cpp/tests/groupby/max_scan_tests.cpp +++ b/cpp/tests/groupby/max_scan_tests.cpp @@ -55,7 +55,7 @@ TYPED_TEST(groupby_max_scan_test, basic) result_wrapper expect_vals({5, 8, 8, 6, 9, 9, 9, 7, 7, 7}); // clang-format on - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -70,7 +70,7 @@ TYPED_TEST(groupby_max_scan_test, empty_cols) key_wrapper expect_keys{}; result_wrapper expect_vals{}; - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -87,7 +87,7 @@ TYPED_TEST(groupby_max_scan_test, zero_valid_keys) result_wrapper expect_vals{}; // clang-format on - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -104,7 +104,7 @@ TYPED_TEST(groupby_max_scan_test, zero_valid_values) result_wrapper expect_vals({-1, -1, -1}, all_nulls()); // clang-format on - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -124,7 +124,7 @@ TYPED_TEST(groupby_max_scan_test, null_keys_and_values) { 0, 1, 1, 1, 1, 0, 1, 1, 1, 0}); // clang-format on - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -152,7 +152,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxScanDecimalAsValue) auto const expect_vals_max = fp_wrapper{{5, 8, 8, 6, 9, 9, 9, 7, 7, 7}, scale}; // clang-format on - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals_max, std::move(agg)); } } diff --git 
a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp index b5710d3f4bc..a1e34b625e8 100644 --- a/cpp/tests/groupby/max_tests.cpp +++ b/cpp/tests/groupby/max_tests.cpp @@ -46,10 +46,10 @@ TYPED_TEST(groupby_max_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals({6, 9, 8}); - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -64,10 +64,10 @@ TYPED_TEST(groupby_max_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -82,10 +82,10 @@ TYPED_TEST(groupby_max_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -100,10 +100,10 @@ TYPED_TEST(groupby_max_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -122,10 +122,10 @@ TYPED_TEST(groupby_max_test, null_keys_and_values) // { 0, 3, 1, 4, 5, 2, 8, -} fixed_width_column_wrapper expect_vals({3, 5, 8, 0}, {1, 1, 1, 0}); - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -140,10 +140,10 @@ TEST_F(groupby_max_string_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; strings_column_wrapper expect_vals({"año", "zit", "₹1"}); - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -155,10 +155,10 @@ TEST_F(groupby_max_string_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; strings_column_wrapper expect_vals({""}, all_nulls()); - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = 
cudf::make_max_aggregation(); + auto agg2 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -187,7 +187,7 @@ TEST_F(groupby_max_string_test, max_sorted_strings) // fixed_width_column_wrapper expect_argmax( // {6, 10, 14, 18, 22, 26, 30, 34, 38, 42, -1}, // {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); - auto agg = cudf::make_max_aggregation(); + auto agg = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, @@ -214,12 +214,16 @@ TEST_F(groupby_dictionary_max_test, basic) auto expect_vals = cudf::dictionary::set_keys(expect_vals_w, vals.keys()); - test_single_agg(keys, vals, expect_keys, expect_vals->view(), cudf::make_max_aggregation()); test_single_agg(keys, vals, expect_keys, expect_vals->view(), - cudf::make_max_aggregation(), + cudf::make_max_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals->view(), + cudf::make_max_aggregation(), force_use_sort_impl::YES); } @@ -247,7 +251,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxDecimalAsValue) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals_max = fp_wrapper{{6, 9, 8}, scale}; - auto agg3 = cudf::make_max_aggregation(); + auto agg3 = cudf::make_max_aggregation(); test_single_agg( keys, vals, expect_keys, expect_vals_max, std::move(agg3), force_use_sort_impl::YES); } @@ -271,7 +275,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupByHashMaxDecimalAsValue) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals_max = fp_wrapper{{6, 9, 8}, scale}; - auto agg7 = cudf::make_max_aggregation(); + auto agg7 = cudf::make_max_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals_max, std::move(agg7)); } } diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp index bac95b11e81..613e1555b79 100644 --- a/cpp/tests/groupby/mean_tests.cpp +++ b/cpp/tests/groupby/mean_tests.cpp @@ -67,7 +67,7 @@ TYPED_TEST(groupby_mean_test, basic) fixed_width_column_wrapper expect_vals(expect_v.cbegin(), expect_v.cend()); // clang-format on - auto agg = cudf::make_mean_aggregation(); + auto agg = cudf::make_mean_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -82,7 +82,7 @@ TYPED_TEST(groupby_mean_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_mean_aggregation(); + auto agg = cudf::make_mean_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -97,7 +97,7 @@ TYPED_TEST(groupby_mean_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_mean_aggregation(); + auto agg = cudf::make_mean_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -112,7 +112,7 @@ TYPED_TEST(groupby_mean_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_mean_aggregation(); + auto agg = cudf::make_mean_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -135,7 +135,7 @@ TYPED_TEST(groupby_mean_test, null_keys_and_values) fixed_width_column_wrapper expect_vals(expect_v.cbegin(), expect_v.cend(), {1, 1, 1, 0}); // clang-format on - auto agg = cudf::make_mean_aggregation(); + auto agg = cudf::make_mean_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, 
std::move(agg)); } // clang-format on @@ -156,7 +156,8 @@ TEST_F(groupby_dictionary_mean_test, basic) fixed_width_column_wrapper expect_vals({9. / 3, 19. / 4, 17. / 3}); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_mean_aggregation()); + test_single_agg( + keys, vals, expect_keys, expect_vals, cudf::make_mean_aggregation()); } } // namespace test diff --git a/cpp/tests/groupby/median_tests.cpp b/cpp/tests/groupby/median_tests.cpp index 18979820911..86d89325401 100644 --- a/cpp/tests/groupby/median_tests.cpp +++ b/cpp/tests/groupby/median_tests.cpp @@ -51,7 +51,7 @@ TYPED_TEST(groupby_median_test, basic) fixed_width_column_wrapper expect_vals({3., 4.5, 7.}, no_nulls()); // clang-format on - auto agg = cudf::make_median_aggregation(); + auto agg = cudf::make_median_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -66,7 +66,7 @@ TYPED_TEST(groupby_median_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_median_aggregation(); + auto agg = cudf::make_median_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -81,7 +81,7 @@ TYPED_TEST(groupby_median_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_median_aggregation(); + auto agg = cudf::make_median_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -96,7 +96,7 @@ TYPED_TEST(groupby_median_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_median_aggregation(); + auto agg = cudf::make_median_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -115,7 +115,7 @@ TYPED_TEST(groupby_median_test, null_keys_and_values) // { 3, 6, 1, 4, 9, 2, 8, -} fixed_width_column_wrapper expect_vals({4.5, 4., 5., 0.}, {1, 1, 1, 0}); - auto agg = cudf::make_median_aggregation(); + auto agg = cudf::make_median_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -134,7 +134,8 @@ TYPED_TEST(groupby_median_test, dictionary) fixed_width_column_wrapper expect_vals({3., 4.5, 7. 
}, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_median_aggregation()); + test_single_agg( + keys, vals, expect_keys, expect_vals, cudf::make_median_aggregation()); } } // namespace test diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp index 29c6185e3a5..b6b1d1a1720 100644 --- a/cpp/tests/groupby/merge_lists_tests.cpp +++ b/cpp/tests/groupby/merge_lists_tests.cpp @@ -42,7 +42,8 @@ auto merge_lists(vcol_views const& keys_cols, vcol_views const& values_cols) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = *values; - requests[0].aggregations.emplace_back(cudf::make_merge_lists_aggregation()); + requests[0].aggregations.emplace_back( + cudf::make_merge_lists_aggregation()); auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); diff --git a/cpp/tests/groupby/merge_m2_tests.cpp b/cpp/tests/groupby/merge_m2_tests.cpp index 3ec8bfec774..60067e78022 100644 --- a/cpp/tests/groupby/merge_m2_tests.cpp +++ b/cpp/tests/groupby/merge_m2_tests.cpp @@ -59,9 +59,9 @@ auto compute_partial_results(cudf::column_view const& keys, cudf::column_view co std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = values; - requests[0].aggregations.emplace_back(cudf::make_count_aggregation()); - requests[0].aggregations.emplace_back(cudf::make_mean_aggregation()); - requests[0].aggregations.emplace_back(cudf::make_m2_aggregation()); + requests[0].aggregations.emplace_back(cudf::make_count_aggregation()); + requests[0].aggregations.emplace_back(cudf::make_mean_aggregation()); + requests[0].aggregations.emplace_back(cudf::make_m2_aggregation()); auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); auto [out_keys, out_results] = gb_obj.aggregate(requests); @@ -88,7 +88,8 @@ auto merge_M2(vcol_views const& keys_cols, vcol_views const& values_cols) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = *values; - requests[0].aggregations.emplace_back(cudf::make_merge_m2_aggregation()); + requests[0].aggregations.emplace_back( + cudf::make_merge_m2_aggregation()); auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp index ee4f61bf44f..5a65774b430 100644 --- a/cpp/tests/groupby/merge_sets_tests.cpp +++ b/cpp/tests/groupby/merge_sets_tests.cpp @@ -42,7 +42,8 @@ auto merge_sets(vcol_views const& keys_cols, vcol_views const& values_cols) std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); requests[0].values = *values; - requests[0].aggregations.emplace_back(cudf::make_merge_sets_aggregation()); + requests[0].aggregations.emplace_back( + cudf::make_merge_sets_aggregation()); auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); diff --git a/cpp/tests/groupby/min_scan_tests.cpp b/cpp/tests/groupby/min_scan_tests.cpp index ef548407761..452f70eaf16 100644 --- a/cpp/tests/groupby/min_scan_tests.cpp +++ b/cpp/tests/groupby/min_scan_tests.cpp @@ -53,7 +53,7 @@ TYPED_TEST(groupby_min_scan_test, basic) result_wrapper expect_vals({5, 5, 1, 6, 6, 0, 0, 7, 2, 2}); // clang-format on - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_scan(keys, vals, 
expect_keys, expect_vals, std::move(agg)); } @@ -68,7 +68,7 @@ TYPED_TEST(groupby_min_scan_test, empty_cols) key_wrapper expect_keys{}; result_wrapper expect_vals{}; - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -85,7 +85,7 @@ TYPED_TEST(groupby_min_scan_test, zero_valid_keys) result_wrapper expect_vals{}; // clang-format on - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -102,7 +102,7 @@ TYPED_TEST(groupby_min_scan_test, zero_valid_values) result_wrapper expect_vals({-1, -1, -1}, all_nulls()); // clang-format on - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -122,7 +122,7 @@ TYPED_TEST(groupby_min_scan_test, null_keys_and_values) { 0, 1, 1, 1, 1, 0, 1, 1, 1, 0}); // clang-format on - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -137,7 +137,7 @@ TEST_F(groupby_min_scan_string_test, basic) key_wrapper expect_keys{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; strings_column_wrapper expect_vals; - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)), "Unsupported groupby scan type-agg combination"); } @@ -167,7 +167,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMinScanDecimalAsValue) auto const expect_vals_min = fp_wrapper{{5, 5, 1, 6, 6, 0, 0, 7, 2, 2}, scale}; // clang-format on - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals_min, std::move(agg)); } } diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp index 1544e867595..59e9d540709 100644 --- a/cpp/tests/groupby/min_tests.cpp +++ b/cpp/tests/groupby/min_tests.cpp @@ -46,10 +46,10 @@ TYPED_TEST(groupby_min_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals({0, 1, 2}); - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -64,10 +64,10 @@ TYPED_TEST(groupby_min_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -82,10 +82,10 @@ TYPED_TEST(groupby_min_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, 
expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -100,10 +100,10 @@ TYPED_TEST(groupby_min_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -122,10 +122,10 @@ TYPED_TEST(groupby_min_test, null_keys_and_values) // { 3, 6, 1, 4, 9, 2, 8, -} fixed_width_column_wrapper expect_vals({3, 1, 2, 0}, {1, 1, 1, 0}); - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -140,10 +140,10 @@ TEST_F(groupby_min_string_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; strings_column_wrapper expect_vals({"aaa", "bat", "$1"}); - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -155,10 +155,10 @@ TEST_F(groupby_min_string_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; strings_column_wrapper expect_vals({""}, all_nulls()); - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -187,7 +187,7 @@ TEST_F(groupby_min_string_test, min_sorted_strings) // fixed_width_column_wrapper expect_argmin( // {6, 10, 14, 18, 22, 26, 30, 34, 38, 42, -1}, // {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); - auto agg = cudf::make_min_aggregation(); + auto agg = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, @@ -214,12 +214,16 @@ TEST_F(groupby_dictionary_min_test, basic) auto expect_vals = cudf::dictionary::set_keys(expect_vals_w, vals.keys()); - test_single_agg(keys, vals, expect_keys, expect_vals->view(), cudf::make_min_aggregation()); test_single_agg(keys, vals, expect_keys, expect_vals->view(), - cudf::make_min_aggregation(), + cudf::make_min_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals->view(), + cudf::make_min_aggregation(), force_use_sort_impl::YES); } @@ -246,7 +250,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMinDecimalAsValue) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals_min = fp_wrapper{{0, 1, 2}, scale}; - auto agg2 = cudf::make_min_aggregation(); + auto agg2 = cudf::make_min_aggregation(); test_single_agg( keys, vals, expect_keys, expect_vals_min, std::move(agg2), force_use_sort_impl::YES); } @@ -270,7 +274,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupByHashMinDecimalAsValue) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals_min = fp_wrapper{{0, 1, 2}, 
scale}; - auto agg6 = cudf::make_min_aggregation(); + auto agg6 = cudf::make_min_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals_min, std::move(agg6)); } } diff --git a/cpp/tests/groupby/nth_element_tests.cpp b/cpp/tests/groupby/nth_element_tests.cpp index d5029147906..22f1e14815f 100644 --- a/cpp/tests/groupby/nth_element_tests.cpp +++ b/cpp/tests/groupby/nth_element_tests.cpp @@ -50,15 +50,15 @@ TYPED_TEST(groupby_nth_element_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; //groupby.first() - auto agg = cudf::make_nth_element_aggregation(0); + auto agg = cudf::make_nth_element_aggregation(0); fixed_width_column_wrapper expect_vals0({0, 1, 2}); test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(1); + agg = cudf::make_nth_element_aggregation(1); fixed_width_column_wrapper expect_vals1({3, 4, 7}); test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(2); + agg = cudf::make_nth_element_aggregation(2); fixed_width_column_wrapper expect_vals2({6, 5, 8}); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg)); } @@ -75,7 +75,7 @@ TYPED_TEST(groupby_nth_element_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_nth_element_aggregation(0); + auto agg = cudf::make_nth_element_aggregation(0); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -90,7 +90,7 @@ TYPED_TEST(groupby_nth_element_test, basic_out_of_bounds) fixed_width_column_wrapper expect_keys{1, 2, 3}; - auto agg = cudf::make_nth_element_aggregation(3); + auto agg = cudf::make_nth_element_aggregation(3); fixed_width_column_wrapper expect_vals({0, 9, 0}, {0, 1, 0}); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -109,15 +109,15 @@ TYPED_TEST(groupby_nth_element_test, negative) fixed_width_column_wrapper expect_keys{1, 2, 3}; //groupby.last() - auto agg = cudf::make_nth_element_aggregation(-1); + auto agg = cudf::make_nth_element_aggregation(-1); fixed_width_column_wrapper expect_vals0({6, 9, 8}); test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-2); + agg = cudf::make_nth_element_aggregation(-2); fixed_width_column_wrapper expect_vals1({3, 5, 7}); test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-3); + agg = cudf::make_nth_element_aggregation(-3); fixed_width_column_wrapper expect_vals2({0, 4, 2}); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg)); } @@ -133,7 +133,7 @@ TYPED_TEST(groupby_nth_element_test, negative_out_of_bounds) fixed_width_column_wrapper expect_keys{1, 2, 3}; - auto agg = cudf::make_nth_element_aggregation(-4); + auto agg = cudf::make_nth_element_aggregation(-4); fixed_width_column_wrapper expect_vals({0, 1, 0}, {0, 1, 0}); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -150,7 +150,7 @@ TYPED_TEST(groupby_nth_element_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_nth_element_aggregation(0); + auto agg = cudf::make_nth_element_aggregation(0); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -166,7 +166,7 @@ TYPED_TEST(groupby_nth_element_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper 
expect_vals({3}, all_nulls()); - auto agg = cudf::make_nth_element_aggregation(0); + auto agg = cudf::make_nth_element_aggregation(0); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -186,7 +186,7 @@ TYPED_TEST(groupby_nth_element_test, null_keys_and_values) //vals {-,3,6, 1,4,-,9, 2,8, -} fixed_width_column_wrapper expect_vals({-1, 1, 2, -1}, {0, 1, 1, 0}); - auto agg = cudf::make_nth_element_aggregation(0); + auto agg = cudf::make_nth_element_aggregation(0); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -206,7 +206,7 @@ TYPED_TEST(groupby_nth_element_test, null_keys_and_values_out_of_bounds) // value, null, out, out fixed_width_column_wrapper expect_vals({6, -1, -1, -1}, {1, 0, 0, 0}); - auto agg = cudf::make_nth_element_aggregation(2); + auto agg = cudf::make_nth_element_aggregation(2); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -237,18 +237,18 @@ TYPED_TEST(groupby_nth_element_test, exclude_nulls) fixed_width_column_wrapper expect_vals1({6, 4, 2, -1}, {1, 1, 1, 0}); fixed_width_column_wrapper expect_vals2({-1, 9, 8, -1}, {0, 1, 1, 0}); - auto agg = cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE); + auto agg = cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_nuls0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(1, cudf::null_policy::INCLUDE); + agg = cudf::make_nth_element_aggregation(1, cudf::null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_nuls1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(2, cudf::null_policy::INCLUDE); + agg = cudf::make_nth_element_aggregation(2, cudf::null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_nuls2, std::move(agg)); - agg = cudf::make_nth_element_aggregation(0, cudf::null_policy::EXCLUDE); + agg = cudf::make_nth_element_aggregation(0, cudf::null_policy::EXCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(1, cudf::null_policy::EXCLUDE); + agg = cudf::make_nth_element_aggregation(1, cudf::null_policy::EXCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(2, cudf::null_policy::EXCLUDE); + agg = cudf::make_nth_element_aggregation(2, cudf::null_policy::EXCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg)); } @@ -282,18 +282,18 @@ TYPED_TEST(groupby_nth_element_test, exclude_nulls_negative_index) fixed_width_column_wrapper expect_vals1({3, 4, 2, -1}, {1, 1, 1, 0}); fixed_width_column_wrapper expect_vals2({-1, 1, 2, -1}, {0, 1, 1, 0}); - auto agg = cudf::make_nth_element_aggregation(-1, cudf::null_policy::INCLUDE); + auto agg = cudf::make_nth_element_aggregation(-1, cudf::null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_nuls0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-2, cudf::null_policy::INCLUDE); + agg = cudf::make_nth_element_aggregation(-2, cudf::null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_nuls1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-3, cudf::null_policy::INCLUDE); + agg = cudf::make_nth_element_aggregation(-3, cudf::null_policy::INCLUDE); test_single_agg(keys, vals, expect_keys, expect_nuls2, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-1, cudf::null_policy::EXCLUDE); + agg = cudf::make_nth_element_aggregation(-1, cudf::null_policy::EXCLUDE); 
test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-2, cudf::null_policy::EXCLUDE); + agg = cudf::make_nth_element_aggregation(-2, cudf::null_policy::EXCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-3, cudf::null_policy::EXCLUDE); + agg = cudf::make_nth_element_aggregation(-3, cudf::null_policy::EXCLUDE); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg)); } @@ -312,38 +312,38 @@ TEST_F(groupby_nth_element_string_test, basic_string) fixed_width_column_wrapper expect_keys{1, 2, 3}; //groupby.first() - auto agg = cudf::make_nth_element_aggregation(0); + auto agg = cudf::make_nth_element_aggregation(0); strings_column_wrapper expect_vals0{"ABCD", "1", "2"}; test_single_agg(keys, vals, expect_keys, expect_vals0, std::move(agg)); - agg = cudf::make_nth_element_aggregation(1); + agg = cudf::make_nth_element_aggregation(1); strings_column_wrapper expect_vals1{"3", "4", "7"}; test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg)); - agg = cudf::make_nth_element_aggregation(2); + agg = cudf::make_nth_element_aggregation(2); strings_column_wrapper expect_vals2{"6", "5", "8"}; test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg)); //+ve out of bounds - agg = cudf::make_nth_element_aggregation(3); + agg = cudf::make_nth_element_aggregation(3); strings_column_wrapper expect_vals3{{"", "9", ""}, {0, 1, 0}}; test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg)); //groupby.last() - agg = cudf::make_nth_element_aggregation(-1); + agg = cudf::make_nth_element_aggregation(-1); strings_column_wrapper expect_vals4{"6", "9", "8"}; test_single_agg(keys, vals, expect_keys, expect_vals4, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-2); + agg = cudf::make_nth_element_aggregation(-2); strings_column_wrapper expect_vals5{"3", "5", "7"}; test_single_agg(keys, vals, expect_keys, expect_vals5, std::move(agg)); - agg = cudf::make_nth_element_aggregation(-3); + agg = cudf::make_nth_element_aggregation(-3); strings_column_wrapper expect_vals6{"ABCD", "4", "2"}; test_single_agg(keys, vals, expect_keys, expect_vals6, std::move(agg)); //-ve out of bounds - agg = cudf::make_nth_element_aggregation(-4); + agg = cudf::make_nth_element_aggregation(-4); strings_column_wrapper expect_vals7{{"", "1", ""}, {0, 1, 0}}; test_single_agg(keys, vals, expect_keys, expect_vals7, std::move(agg)); } @@ -361,8 +361,11 @@ TEST_F(groupby_nth_element_string_test, dictionary) auto expect_vals = cudf::dictionary::set_keys(expect_vals_w, vals.keys()); - test_single_agg( - keys, vals, expect_keys, expect_vals->view(), cudf::make_nth_element_aggregation(2)); + test_single_agg(keys, + vals, + expect_keys, + expect_vals->view(), + cudf::make_nth_element_aggregation(2)); } template @@ -384,8 +387,11 @@ TYPED_TEST(groupby_nth_element_lists_test, Basics) auto expected_keys = fixed_width_column_wrapper{1, 2, 3}; auto expected_values = lists{{1, 2}, {5, 6, 7}, {9, 10}}; - test_single_agg( - keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(0)); + test_single_agg(keys, + values, + expected_keys, + expected_values, + cudf::make_nth_element_aggregation(0)); } TYPED_TEST(groupby_nth_element_lists_test, EmptyInput) @@ -401,8 +407,11 @@ TYPED_TEST(groupby_nth_element_lists_test, EmptyInput) auto expected_keys = fixed_width_column_wrapper{}; auto expected_values = lists{}; - test_single_agg( - keys, values, 
expected_keys, expected_values, cudf::make_nth_element_aggregation(2)); + test_single_agg(keys, + values, + expected_keys, + expected_values, + cudf::make_nth_element_aggregation(2)); } } // namespace test diff --git a/cpp/tests/groupby/nunique_tests.cpp b/cpp/tests/groupby/nunique_tests.cpp index 089ca8805d4..88a6a1c903b 100644 --- a/cpp/tests/groupby/nunique_tests.cpp +++ b/cpp/tests/groupby/nunique_tests.cpp @@ -49,7 +49,7 @@ TYPED_TEST(groupby_nunique_test, basic) fixed_width_column_wrapper expect_bool_vals{2, 1, 1}; // clang-format on - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); if (std::is_same()) test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg)); else @@ -67,7 +67,7 @@ TYPED_TEST(groupby_nunique_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -83,7 +83,7 @@ TYPED_TEST(groupby_nunique_test, basic_duplicates) fixed_width_column_wrapper expect_vals{2, 4, 1}; fixed_width_column_wrapper expect_bool_vals{2, 1, 1}; - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); if (std::is_same()) test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg)); else @@ -101,7 +101,7 @@ TYPED_TEST(groupby_nunique_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -116,7 +116,7 @@ TYPED_TEST(groupby_nunique_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals{0}; - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -136,7 +136,7 @@ TYPED_TEST(groupby_nunique_test, null_keys_and_values) fixed_width_column_wrapper expect_vals{2, 3, 2, 0}; fixed_width_column_wrapper expect_bool_vals{1, 1, 1, 0}; - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); if (std::is_same()) test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg)); else @@ -160,7 +160,7 @@ TYPED_TEST(groupby_nunique_test, null_keys_and_values_with_duplicates) fixed_width_column_wrapper expect_vals{2, 3, 2, 0}; fixed_width_column_wrapper expect_bool_vals{1, 1, 1, 0}; - auto agg = cudf::make_nunique_aggregation(); + auto agg = cudf::make_nunique_aggregation(); if (std::is_same()) test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg)); else @@ -184,7 +184,7 @@ TYPED_TEST(groupby_nunique_test, include_nulls) fixed_width_column_wrapper expect_vals{3, 4, 2, 1}; fixed_width_column_wrapper expect_bool_vals{2, 2, 1, 1}; - auto agg = cudf::make_nunique_aggregation(null_policy::INCLUDE); + auto agg = cudf::make_nunique_aggregation(null_policy::INCLUDE); if (std::is_same()) test_single_agg(keys, vals, expect_keys, expect_bool_vals, std::move(agg)); else @@ -213,8 +213,11 @@ TYPED_TEST(groupby_nunique_test, dictionary) cudf::column_view expect_vals = (std::is_same()) ? 
cudf::column_view{expect_bool_vals} : cudf::column_view{expect_fixed_vals}; - test_single_agg( - keys, vals, expect_keys, expect_vals, cudf::make_nunique_aggregation(null_policy::INCLUDE)); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_nunique_aggregation(null_policy::INCLUDE)); } } // namespace test diff --git a/cpp/tests/groupby/product_tests.cpp b/cpp/tests/groupby/product_tests.cpp index eaa2cc07ff8..047bf856493 100644 --- a/cpp/tests/groupby/product_tests.cpp +++ b/cpp/tests/groupby/product_tests.cpp @@ -51,7 +51,11 @@ TYPED_TEST(groupby_product_test, basic) fixed_width_column_wrapper expect_vals({ 0., 180., 112. }, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } TYPED_TEST(groupby_product_test, empty_cols) @@ -65,7 +69,11 @@ TYPED_TEST(groupby_product_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } TYPED_TEST(groupby_product_test, zero_valid_keys) @@ -79,7 +87,11 @@ TYPED_TEST(groupby_product_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } TYPED_TEST(groupby_product_test, zero_valid_values) @@ -93,7 +105,11 @@ TYPED_TEST(groupby_product_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } TYPED_TEST(groupby_product_test, null_keys_and_values) @@ -114,7 +130,11 @@ TYPED_TEST(groupby_product_test, null_keys_and_values) { 1, 1, 1, 0}); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } TYPED_TEST(groupby_product_test, dictionary) @@ -132,7 +152,11 @@ TYPED_TEST(groupby_product_test, dictionary) fixed_width_column_wrapper expect_vals({ 0., 180., 112. }, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } TYPED_TEST(groupby_product_test, dictionary_with_nulls) @@ -151,7 +175,11 @@ TYPED_TEST(groupby_product_test, dictionary_with_nulls) fixed_width_column_wrapper expect_vals({ 0., 180., 56. 
}, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_product_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_product_aggregation()); } } // namespace test diff --git a/cpp/tests/groupby/quantile_tests.cpp b/cpp/tests/groupby/quantile_tests.cpp index a82dae9edcb..43b065ee4d3 100644 --- a/cpp/tests/groupby/quantile_tests.cpp +++ b/cpp/tests/groupby/quantile_tests.cpp @@ -51,7 +51,7 @@ TYPED_TEST(groupby_quantile_test, basic) fixed_width_column_wrapper expect_vals({3., 4.5, 7.}, no_nulls()); // clang-format on - auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); + auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -66,7 +66,7 @@ TYPED_TEST(groupby_quantile_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); + auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -81,7 +81,7 @@ TYPED_TEST(groupby_quantile_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); + auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -96,7 +96,7 @@ TYPED_TEST(groupby_quantile_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); + auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -115,7 +115,7 @@ TYPED_TEST(groupby_quantile_test, null_keys_and_values) // { 3, 6, 1, 4, 9, 2, 8, -} fixed_width_column_wrapper expect_vals({4.5, 4., 5., 0.}, {1, 1, 1, 0}); - auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); + auto agg = cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -134,7 +134,8 @@ TYPED_TEST(groupby_quantile_test, multiple_quantile) fixed_width_column_wrapper expect_vals({1.5, 4.5, 3.25, 6., 4.5, 7.5}, no_nulls()); // clang-format on - auto agg = cudf::make_quantile_aggregation({0.25, 0.75}, interpolation::LINEAR); + auto agg = + cudf::make_quantile_aggregation({0.25, 0.75}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); } @@ -152,27 +153,27 @@ TYPED_TEST(groupby_quantile_test, interpolation_types) // {0, 3, 6, 1, 4, 5, 9, 2, 7} fixed_width_column_wrapper expect_vals1({2.4, 4.2, 4.}, no_nulls()); - auto agg1 = cudf::make_quantile_aggregation({0.4}, interpolation::LINEAR); + auto agg1 = cudf::make_quantile_aggregation({0.4}, interpolation::LINEAR); test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1)); // {0, 3, 6, 1, 4, 5, 9, 2, 7} fixed_width_column_wrapper expect_vals2({3, 4, 2}, no_nulls()); - auto agg2 = cudf::make_quantile_aggregation({0.4}, interpolation::NEAREST); + auto agg2 = cudf::make_quantile_aggregation({0.4}, interpolation::NEAREST); test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2)); // 
{0, 3, 6, 1, 4, 5, 9, 2, 7} fixed_width_column_wrapper expect_vals3({0, 4, 2}, no_nulls()); - auto agg3 = cudf::make_quantile_aggregation({0.4}, interpolation::LOWER); + auto agg3 = cudf::make_quantile_aggregation({0.4}, interpolation::LOWER); test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg3)); // {0, 3, 6, 1, 4, 5, 9, 2, 7} fixed_width_column_wrapper expect_vals4({3, 5, 7}, no_nulls()); - auto agg4 = cudf::make_quantile_aggregation({0.4}, interpolation::HIGHER); + auto agg4 = cudf::make_quantile_aggregation({0.4}, interpolation::HIGHER); test_single_agg(keys, vals, expect_keys, expect_vals4, std::move(agg4)); // {0, 3, 6, 1, 4, 5, 9, 2, 7} fixed_width_column_wrapper expect_vals5({1.5, 4.5, 4.5}, no_nulls()); - auto agg5 = cudf::make_quantile_aggregation({0.4}, interpolation::MIDPOINT); + auto agg5 = cudf::make_quantile_aggregation({0.4}, interpolation::MIDPOINT); test_single_agg(keys, vals, expect_keys, expect_vals5, std::move(agg5)); // clang-format on } @@ -192,11 +193,12 @@ TYPED_TEST(groupby_quantile_test, dictionary) fixed_width_column_wrapper expect_vals({3., 4.5, 7.}, no_nulls()); // clang-format on - test_single_agg(keys, - vals, - expect_keys, - expect_vals, - cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR)); + test_single_agg( + keys, + vals, + expect_keys, + expect_vals, + cudf::make_quantile_aggregation({0.5}, interpolation::LINEAR)); } } // namespace test diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index 51c4c1e63c2..37e75e2e906 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -39,11 +39,16 @@ inline void test_pair_rank_scans(column_view const& keys, order, keys, expected_dense, - make_dense_rank_aggregation(), + make_dense_rank_aggregation(), + null_policy::INCLUDE, + sorted::YES); + test_single_scan(keys, + order, + keys, + expected_rank, + make_rank_aggregation(), null_policy::INCLUDE, sorted::YES); - test_single_scan( - keys, order, keys, expected_rank, make_rank_aggregation(), null_policy::INCLUDE, sorted::YES); } struct groupby_rank_scan_test : public BaseFixture { @@ -201,11 +206,11 @@ TYPED_TEST(typed_groupby_rank_scan_test, mixedStructs) auto expected_rank_vals = fixed_width_column_wrapper{1, 1, 3, 3, 5, 6, 1, 1, 3, 1, 1, 3}; - std::vector requests; - requests.emplace_back(groupby::aggregation_request()); + std::vector requests; + requests.emplace_back(groupby::scan_request()); requests[0].values = *struct_col; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); + requests[0].aggregations.push_back(make_dense_rank_aggregation()); + requests[0].aggregations.push_back(make_rank_aggregation()); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); auto result = gb_obj.scan(requests); @@ -377,34 +382,61 @@ TEST_F(groupby_rank_scan_test_failures, test_exception_triggers) fixed_width_column_wrapper col{3, 3, 1}; CUDF_EXPECT_THROW_MESSAGE( - test_single_scan( - keys, col, keys, col, make_dense_rank_aggregation(), null_policy::INCLUDE, sorted::NO), + test_single_scan(keys, + col, + keys, + col, + make_dense_rank_aggregation(), + null_policy::INCLUDE, + sorted::NO), "Dense rank aggregate in groupby scan requires the keys to be presorted"); - CUDF_EXPECT_THROW_MESSAGE( - test_single_scan( - keys, col, keys, col, make_rank_aggregation(), null_policy::INCLUDE, sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); + 
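// Illustrative sketch: the change above routes rank/dense_rank through the
// new groupby::scan_request instead of aggregation_request, and the failure
// tests in this file assert that rank scans require pre-sorted keys. A
// minimal pre-sorted dense-rank scan might look as follows; the stripped
// template parameter on the factory is assumed to be
// cudf::groupby_scan_aggregation.
#include <cudf/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <vector>

void dense_rank_scan_sketch()
{
  cudf::test::fixed_width_column_wrapper<int32_t> keys{0, 0, 0, 1, 1, 2};
  cudf::test::fixed_width_column_wrapper<int32_t> order{5, 5, 7, 3, 4, 9};

  std::vector<cudf::groupby::scan_request> requests(1);
  requests[0].values = order;
  requests[0].aggregations.push_back(
    cudf::make_dense_rank_aggregation<cudf::groupby_scan_aggregation>());

  // Keys must be pre-sorted, matching the exception messages tested here.
  cudf::groupby::groupby gb(
    cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::YES);
  auto [grouped_keys, results] = gb.scan(requests);
  // Expected dense ranks per group: {1, 1, 2}, {1, 2}, {1}.
}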
CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(), + null_policy::INCLUDE, + sorted::NO), + "Rank aggregate in groupby scan requires the keys to be presorted"); CUDF_EXPECT_THROW_MESSAGE( - test_single_scan( - keys, col, keys, col, make_dense_rank_aggregation(), null_policy::EXCLUDE, sorted::YES), + test_single_scan(keys, + col, + keys, + col, + make_dense_rank_aggregation(), + null_policy::EXCLUDE, + sorted::YES), "Dense rank aggregate in groupby scan requires the keys to be presorted"); - CUDF_EXPECT_THROW_MESSAGE( - test_single_scan( - keys, col, keys, col, make_rank_aggregation(), null_policy::EXCLUDE, sorted::YES), - "Rank aggregate in groupby scan requires the keys to be presorted"); + CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(), + null_policy::EXCLUDE, + sorted::YES), + "Rank aggregate in groupby scan requires the keys to be presorted"); CUDF_EXPECT_THROW_MESSAGE( - test_single_scan( - keys, col, keys, col, make_dense_rank_aggregation(), null_policy::EXCLUDE, sorted::NO), + test_single_scan(keys, + col, + keys, + col, + make_dense_rank_aggregation(), + null_policy::EXCLUDE, + sorted::NO), "Dense rank aggregate in groupby scan requires the keys to be presorted"); - CUDF_EXPECT_THROW_MESSAGE( - test_single_scan( - keys, col, keys, col, make_rank_aggregation(), null_policy::EXCLUDE, sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); + CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(), + null_policy::EXCLUDE, + sorted::NO), + "Rank aggregate in groupby scan requires the keys to be presorted"); } } // namespace test diff --git a/cpp/tests/groupby/std_tests.cpp b/cpp/tests/groupby/std_tests.cpp index c771971ad9a..e2edabf3e8f 100644 --- a/cpp/tests/groupby/std_tests.cpp +++ b/cpp/tests/groupby/std_tests.cpp @@ -53,7 +53,7 @@ TYPED_TEST(groupby_std_test, basic) fixed_width_column_wrapper expect_vals({3., sqrt(131./12), sqrt(31./3)}, no_nulls()); // clang-format on - auto agg = cudf::make_std_aggregation(); + auto agg = cudf::make_std_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -68,7 +68,7 @@ TYPED_TEST(groupby_std_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_std_aggregation(); + auto agg = cudf::make_std_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -83,7 +83,7 @@ TYPED_TEST(groupby_std_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_std_aggregation(); + auto agg = cudf::make_std_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -98,7 +98,7 @@ TYPED_TEST(groupby_std_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_std_aggregation(); + auto agg = cudf::make_std_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -118,7 +118,7 @@ TYPED_TEST(groupby_std_test, null_keys_and_values) fixed_width_column_wrapper expect_vals({3 / sqrt(2), 7 / sqrt(3), 3 * sqrt(2), 0.}, {1, 1, 1, 0}); - auto agg = cudf::make_std_aggregation(); + auto agg = cudf::make_std_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -137,7 +137,7 @@ TYPED_TEST(groupby_std_test, 
ddof_non_default) // { 3, 6, 1, 4, 9, 2, 8, 3} fixed_width_column_wrapper expect_vals({0., 7 * sqrt(2. / 3), 0., 0.}, {0, 1, 0, 0}); - auto agg = cudf::make_std_aggregation(2); + auto agg = cudf::make_std_aggregation(2); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -156,7 +156,8 @@ TYPED_TEST(groupby_std_test, dictionary) fixed_width_column_wrapper expect_vals({3., sqrt(131./12), sqrt(31./3)}, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_std_aggregation()); + test_single_agg( + keys, vals, expect_keys, expect_vals, cudf::make_std_aggregation()); } } // namespace test diff --git a/cpp/tests/groupby/sum_of_squares_tests.cpp b/cpp/tests/groupby/sum_of_squares_tests.cpp index 12b044c7382..0dab2c6483e 100644 --- a/cpp/tests/groupby/sum_of_squares_tests.cpp +++ b/cpp/tests/groupby/sum_of_squares_tests.cpp @@ -49,7 +49,7 @@ TYPED_TEST(groupby_sum_of_squares_test, basic) // { 0, 3, 6, 1, 4, 5, 9, 2, 7, 8} fixed_width_column_wrapper expect_vals({45., 123., 117.}, no_nulls()); - auto agg = cudf::make_sum_of_squares_aggregation(); + auto agg = cudf::make_sum_of_squares_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -64,7 +64,7 @@ TYPED_TEST(groupby_sum_of_squares_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_sum_of_squares_aggregation(); + auto agg = cudf::make_sum_of_squares_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -79,7 +79,7 @@ TYPED_TEST(groupby_sum_of_squares_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_sum_of_squares_aggregation(); + auto agg = cudf::make_sum_of_squares_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -94,7 +94,7 @@ TYPED_TEST(groupby_sum_of_squares_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_sum_of_squares_aggregation(); + auto agg = cudf::make_sum_of_squares_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -113,7 +113,7 @@ TYPED_TEST(groupby_sum_of_squares_test, null_keys_and_values) // { 3, 6, 1, 4, 9, 2, 8, 3} fixed_width_column_wrapper expect_vals({45., 98., 68., 9.}, {1, 1, 1, 0}); - auto agg = cudf::make_sum_of_squares_aggregation(); + auto agg = cudf::make_sum_of_squares_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -132,7 +132,11 @@ TYPED_TEST(groupby_sum_of_squares_test, dictionary) fixed_width_column_wrapper expect_vals({45., 123., 117. 
}, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_sum_of_squares_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_sum_of_squares_aggregation()); } } // namespace test diff --git a/cpp/tests/groupby/sum_scan_tests.cpp b/cpp/tests/groupby/sum_scan_tests.cpp index 2f1928747ae..86fc0238597 100644 --- a/cpp/tests/groupby/sum_scan_tests.cpp +++ b/cpp/tests/groupby/sum_scan_tests.cpp @@ -57,7 +57,7 @@ TYPED_TEST(groupby_sum_scan_test, basic) // {0, 3, 6, 1, 4, 5, 9, 2, 7, 8} result_wrapper expect_vals{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}; // clang-format on - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -74,7 +74,7 @@ TYPED_TEST(groupby_sum_scan_test, empty_cols) result_wrapper expect_vals{}; // clang-format on - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -91,7 +91,7 @@ TYPED_TEST(groupby_sum_scan_test, zero_valid_keys) result_wrapper expect_vals{}; // clang-format on - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -108,7 +108,7 @@ TYPED_TEST(groupby_sum_scan_test, zero_valid_values) result_wrapper expect_vals({3, 4, 5}, all_nulls()); // clang-format on - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -128,7 +128,7 @@ TYPED_TEST(groupby_sum_scan_test, null_keys_and_values) { 0, 1, 1, 1, 1, 0, 1, 1, 1, 0}); // clang-format on - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -156,7 +156,7 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortSumScanDecimalAsValue) auto const expect_vals_sum = out_fp_wrapper{{0, 3, 9, 1, 5, 10, 19, 2, 9, 17}, scale}; // clang-format on - auto agg2 = cudf::make_sum_aggregation(); + auto agg2 = cudf::make_sum_aggregation(); test_single_scan(keys, vals, expect_keys, expect_vals_sum, std::move(agg2)); } } diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp index 458937ff2e4..5c935ee5a9d 100644 --- a/cpp/tests/groupby/sum_tests.cpp +++ b/cpp/tests/groupby/sum_tests.cpp @@ -49,10 +49,10 @@ TYPED_TEST(groupby_sum_test, basic) fixed_width_column_wrapper expect_keys{1, 2, 3}; fixed_width_column_wrapper expect_vals{9, 19, 17}; - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_sum_aggregation(); + auto agg2 = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -67,10 +67,10 @@ TYPED_TEST(groupby_sum_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_sum_aggregation(); + auto agg2 = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -85,10 +85,10 @@ TYPED_TEST(groupby_sum_test, 
zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_sum_aggregation(); + auto agg2 = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -103,10 +103,10 @@ TYPED_TEST(groupby_sum_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_sum_aggregation(); + auto agg2 = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } @@ -125,10 +125,10 @@ TYPED_TEST(groupby_sum_test, null_keys_and_values) // { 3, 6, 1, 4, 9, 2, 8, -} fixed_width_column_wrapper expect_vals({9, 14, 10, 0}, {1, 1, 1, 0}); - auto agg = cudf::make_sum_aggregation(); + auto agg = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); - auto agg2 = cudf::make_sum_aggregation(); + auto agg2 = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg2), force_use_sort_impl::YES); } // clang-format on @@ -146,9 +146,14 @@ TYPED_TEST(groupby_sum_test, dictionary) fixed_width_column_wrapper expect_vals{ 9, 19, 17}; // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation()); test_single_agg( - keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation(), force_use_sort_impl::YES); + keys, vals, expect_keys, expect_vals, cudf::make_sum_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_sum_aggregation(), + force_use_sort_impl::YES); } template @@ -176,11 +181,11 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortSumDecimalAsValue) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals_sum = fp64_wrapper{{9, 19, 17}, scale}; - auto agg1 = cudf::make_sum_aggregation(); + auto agg1 = cudf::make_sum_aggregation(); test_single_agg( keys, vals, expect_keys, expect_vals_sum, std::move(agg1), force_use_sort_impl::YES); - auto agg4 = cudf::make_product_aggregation(); + auto agg4 = cudf::make_product_aggregation(); EXPECT_THROW( test_single_agg(keys, vals, expect_keys, {}, std::move(agg4), force_use_sort_impl::YES), cudf::logic_error); @@ -206,10 +211,10 @@ TYPED_TEST(FixedPointTestBothReps, GroupByHashSumDecimalAsValue) auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; auto const expect_vals_sum = fp64_wrapper{{9, 19, 17}, scale}; - auto agg5 = cudf::make_sum_aggregation(); + auto agg5 = cudf::make_sum_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals_sum, std::move(agg5)); - auto agg8 = cudf::make_product_aggregation(); + auto agg8 = cudf::make_product_aggregation(); EXPECT_THROW(test_single_agg(keys, vals, expect_keys, {}, std::move(agg8)), cudf::logic_error); } } diff --git a/cpp/tests/groupby/var_tests.cpp b/cpp/tests/groupby/var_tests.cpp index c3fc781801d..68ccf791960 100644 --- a/cpp/tests/groupby/var_tests.cpp +++ b/cpp/tests/groupby/var_tests.cpp @@ -53,7 +53,7 @@ TYPED_TEST(groupby_var_test, basic) fixed_width_column_wrapper expect_vals({9., 131. / 12, 31. 
/ 3}, no_nulls()); // clang-format on - auto agg = cudf::make_variance_aggregation(); + auto agg = cudf::make_variance_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -68,7 +68,7 @@ TYPED_TEST(groupby_var_test, empty_cols) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_variance_aggregation(); + auto agg = cudf::make_variance_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -83,7 +83,7 @@ TYPED_TEST(groupby_var_test, zero_valid_keys) fixed_width_column_wrapper expect_keys{}; fixed_width_column_wrapper expect_vals{}; - auto agg = cudf::make_variance_aggregation(); + auto agg = cudf::make_variance_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -98,7 +98,7 @@ TYPED_TEST(groupby_var_test, zero_valid_values) fixed_width_column_wrapper expect_keys{1}; fixed_width_column_wrapper expect_vals({0}, all_nulls()); - auto agg = cudf::make_variance_aggregation(); + auto agg = cudf::make_variance_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -119,7 +119,7 @@ TYPED_TEST(groupby_var_test, null_keys_and_values) fixed_width_column_wrapper expect_vals({4.5, 49. / 3, 18., 0.}, {1, 1, 1, 0}); // clang-format on - auto agg = cudf::make_variance_aggregation(); + auto agg = cudf::make_variance_aggregation(); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -141,7 +141,7 @@ TYPED_TEST(groupby_var_test, ddof_non_default) {0, 1, 0, 0}); // clang-format on - auto agg = cudf::make_variance_aggregation(2); + auto agg = cudf::make_variance_aggregation(2); test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); } @@ -160,7 +160,11 @@ TYPED_TEST(groupby_var_test, dictionary) fixed_width_column_wrapper expect_vals({9., 131./12, 31./3 }, no_nulls()); // clang-format on - test_single_agg(keys, vals, expect_keys, expect_vals, cudf::make_variance_aggregation()); + test_single_agg(keys, + vals, + expect_keys, + expect_vals, + cudf::make_variance_aggregation()); } } // namespace test diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 94f01fd62f3..5b6270a8be1 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -50,6 +50,16 @@ namespace cudf_io = cudf::io; +using cudf::data_type; +using cudf::type_id; +using cudf::type_to_id; + +template +auto dtype() +{ + return data_type{type_to_id()}; +} + template using column_wrapper = typename std::conditional, @@ -80,7 +90,6 @@ struct CsvReaderTest : public cudf::test::BaseFixture { // Typed test fixture for timestamp type tests template struct CsvReaderNumericTypeTest : public CsvReaderTest { - auto type() { return cudf::data_type{cudf::type_to_id()}; } }; // Declare typed test cases @@ -93,8 +102,8 @@ struct CsvFixedPointReaderTest : public CsvReaderTest { void run_tests(const std::vector& reference_strings, numeric::scale_type scale) { cudf::test::strings_column_wrapper strings(reference_strings.begin(), reference_strings.end()); - auto input_column = cudf::strings::to_fixed_point( - cudf::strings_column_view(strings), cudf::data_type{cudf::type_to_id(), scale}); + auto input_column = cudf::strings::to_fixed_point(cudf::strings_column_view(strings), + data_type{type_to_id(), scale}); std::string buffer = std::accumulate(reference_strings.begin(), reference_strings.end(), @@ -105,7 +114,7 @@ struct CsvFixedPointReaderTest : public CsvReaderTest { cudf_io::csv_reader_options 
in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()}) - .dtypes({cudf::data_type{cudf::type_to_id(), scale}}) + .dtypes({data_type{type_to_id(), scale}}) .header(-1); const auto result = cudf_io::read_csv(in_opts); @@ -389,9 +398,9 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale) reference_strings = valid_reference_strings; using DecimalType = TypeParam; - auto input_column = cudf::strings::to_fixed_point( - cudf::strings_column_view(strings), - cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}); + auto input_column = + cudf::strings::to_fixed_point(cudf::strings_column_view(strings), + data_type{type_to_id(), numeric::scale_type{-2}}); auto input_table = cudf::table_view{std::vector{*input_column}}; @@ -406,7 +415,7 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale) result_strings.reserve(reference_strings.size()); std::ifstream read_result_file(filepath); - assert(read_result_file.is_open()); + ASSERT_TRUE(read_result_file.is_open()); std::copy(std::istream_iterator(read_result_file), std::istream_iterator(), @@ -435,9 +444,9 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale) reference_strings = valid_reference_strings; using DecimalType = TypeParam; - auto input_column = cudf::strings::to_fixed_point( - cudf::strings_column_view(strings), - cudf::data_type{cudf::type_to_id(), numeric::scale_type{3}}); + auto input_column = + cudf::strings::to_fixed_point(cudf::strings_column_view(strings), + data_type{type_to_id(), numeric::scale_type{3}}); auto input_table = cudf::table_view{std::vector{*input_column}}; @@ -452,7 +461,7 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale) result_strings.reserve(reference_strings.size()); std::ifstream read_result_file(filepath); - assert(read_result_file.is_open()); + ASSERT_TRUE(read_result_file.is_open()); std::copy(std::istream_iterator(read_result_file), std::istream_iterator(), @@ -479,11 +488,10 @@ TEST_F(CsvReaderTest, MultiColumn) { std::ostringstream line; for (int i = 0; i < num_rows; ++i) { - line << std::to_string(int8_values[i]) << "," << int16_values[i] << "," << int16_values[i] - << "," << int32_values[i] << "," << int32_values[i] << "," << int64_values[i] << "," - << int64_values[i] << "," << std::to_string(uint8_values[i]) << "," << uint16_values[i] - << "," << uint32_values[i] << "," << uint64_values[i] << "," << float32_values[i] << "," - << float32_values[i] << "," << float64_values[i] << "," << float64_values[i] << "\n"; + line << std::to_string(int8_values[i]) << "," << int16_values[i] << "," << int32_values[i] + << "," << int64_values[i] << "," << std::to_string(uint8_values[i]) << "," + << uint16_values[i] << "," << uint32_values[i] << "," << uint64_values[i] << "," + << float32_values[i] << "," << float64_values[i] << "\n"; } std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); @@ -492,39 +500,29 @@ TEST_F(CsvReaderTest, MultiColumn) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .header(-1) - .dtypes(std::vector{"int8", - "short", - "int16", - "int", - "int32", - "long", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "float", - "float32", - "double", - "float64"}); + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype()}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); expect_column_data_equal(int8_values, view.column(0)); 
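// Illustrative sketch: the new dtype() test helper introduced above (its
// template parameter is stripped by the diff rendering) presumably reads as
// below, and feeds the reworked dtypes() API that now takes cudf::data_type
// values instead of dtype name strings. The column names and element types
// in the reader call are hypothetical.
#include <cudf/io/csv.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/type_dispatcher.hpp>
#include <string>

template <typename T>
auto dtype()
{
  return cudf::data_type{cudf::type_to_id<T>()};
}

void read_csv_with_data_types(std::string const& filepath)
{
  cudf::io::csv_reader_options opts =
    cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath})
      .names({"A", "B"})
      .dtypes({dtype<int32_t>(), dtype<float>()})  // previously: {"int32", "float32"}
      .header(-1);
  auto result = cudf::io::read_csv(opts);
  // result.tbl holds an INT32 column "A" and a FLOAT32 column "B".
}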
expect_column_data_equal(int16_values, view.column(1)); - expect_column_data_equal(int16_values, view.column(2)); - expect_column_data_equal(int32_values, view.column(3)); - expect_column_data_equal(int32_values, view.column(4)); - expect_column_data_equal(int64_values, view.column(5)); - expect_column_data_equal(int64_values, view.column(6)); - expect_column_data_equal(uint8_values, view.column(7)); - expect_column_data_equal(uint16_values, view.column(8)); - expect_column_data_equal(uint32_values, view.column(9)); - expect_column_data_equal(uint64_values, view.column(10)); - expect_column_data_equal(float32_values, view.column(11)); - expect_column_data_equal(float32_values, view.column(12)); - expect_column_data_equal(float64_values, view.column(13)); - expect_column_data_equal(float64_values, view.column(14)); + expect_column_data_equal(int32_values, view.column(2)); + expect_column_data_equal(int64_values, view.column(3)); + expect_column_data_equal(uint8_values, view.column(4)); + expect_column_data_equal(uint16_values, view.column(5)); + expect_column_data_equal(uint32_values, view.column(6)); + expect_column_data_equal(uint64_values, view.column(7)); + expect_column_data_equal(float32_values, view.column(8)); + expect_column_data_equal(float64_values, view.column(9)); } TEST_F(CsvReaderTest, RepeatColumn) @@ -549,7 +547,7 @@ TEST_F(CsvReaderTest, RepeatColumn) // repeats column in indexes and names, misses 1 column. cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes(std::vector{"int16", "int64", "uint64", "float"}) + .dtypes({dtype(), dtype(), dtype(), dtype()}) .names({"A", "B", "C", "D"}) .use_cols_indexes({1, 0, 0}) .use_cols_names({"D", "B", "B"}) @@ -575,7 +573,7 @@ TEST_F(CsvReaderTest, Booleans) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A", "B", "C", "D"}) - .dtypes(std::vector{"int32", "int32", "short", "bool"}) + .dtypes({dtype(), dtype(), dtype(), dtype()}) .true_values({"yes", "Yes", "YES", "foo", "FOO"}) .false_values({"no", "No", "NO", "Bar", "bar"}) .header(-1); @@ -584,10 +582,10 @@ TEST_F(CsvReaderTest, Booleans) // Booleans are the same (integer) data type, but valued at 0 or 1 const auto view = result.tbl->view(); EXPECT_EQ(4, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::INT32, view.column(1).type().id()); - ASSERT_EQ(cudf::type_id::INT16, view.column(2).type().id()); - ASSERT_EQ(cudf::type_id::BOOL8, view.column(3).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(1).type().id()); + ASSERT_EQ(type_id::INT16, view.column(2).type().id()); + ASSERT_EQ(type_id::BOOL8, view.column(3).type().id()); expect_column_data_equal(std::vector{1, 0, 0, 0, 1}, view.column(0)); expect_column_data_equal(std::vector{0, 1, 1, 0, 1}, view.column(2)); @@ -607,14 +605,14 @@ TEST_F(CsvReaderTest, Dates) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .dayfirst(true) .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); using namespace 
cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_ms{983750400000ms}, @@ -643,15 +641,14 @@ TEST_F(CsvReaderTest, DatesCastToTimestampSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_SECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_s{983750400s}, @@ -680,15 +677,14 @@ TEST_F(CsvReaderTest, DatesCastToTimestampMilliSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_ms{983750400000ms}, @@ -717,15 +713,14 @@ TEST_F(CsvReaderTest, DatesCastToTimestampMicroSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MICROSECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal(std::vector{cudf::timestamp_us{983750400000000us}, @@ -754,15 +749,14 @@ TEST_F(CsvReaderTest, DatesCastToTimestampNanoSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_NANOSECONDS}}) .dayfirst(true) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}); + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; expect_column_data_equal( @@ -795,14 +789,13 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[s]"}) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}); + 
.dtypes({data_type{type_id::TIMESTAMP_SECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_SECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -824,14 +817,13 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMilliSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[ms]"}) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MILLISECONDS}); + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MILLISECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -853,14 +845,13 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampMicroSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[us]"}) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_MICROSECONDS}); + .dtypes({data_type{type_id::TIMESTAMP_MICROSECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_MICROSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -882,14 +873,13 @@ TEST_F(CsvReaderTest, IntegersCastToTimestampNanoSeconds) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"datetime64[ns]"}) - .header(-1) - .timestamp_type(cudf::data_type{cudf::type_id::TIMESTAMP_NANOSECONDS}); + .dtypes({data_type{type_id::TIMESTAMP_NANOSECONDS}}) + .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); + ASSERT_EQ(type_id::TIMESTAMP_NANOSECONDS, view.column(0).type().id()); using namespace cuda::std::chrono_literals; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_column, view.column(0)); @@ -907,14 +897,14 @@ TEST_F(CsvReaderTest, FloatingPoint) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"float32"}) + .dtypes({dtype()}) .lineterminator(';') .header(-1); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::FLOAT32, view.column(0).type().id()); + ASSERT_EQ(type_id::FLOAT32, view.column(0).type().id()); const auto ref_vals = std::vector{5.6, 56.79, 12000000000, 0.7, 3.000, 12.34, 0.31, -73.98007199999998}; @@ -940,14 +930,14 @@ TEST_F(CsvReaderTest, Strings) cudf_io::csv_reader_options in_opts = 
cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(2, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( std::vector{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}, @@ -970,14 +960,14 @@ TEST_F(CsvReaderTest, StringsQuotes) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quotechar('`'); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(2, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( std::vector{"abc,\ndef, ghi", "jkl, `mno`, pqr", "stu `vwx` yz"}, view.column(1)); @@ -999,15 +989,15 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE) .doublequote(false); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(2, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); - ASSERT_EQ(cudf::type_id::STRING, view.column(1).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( std::vector{"\"abcdef ghi\"", "\"jkl \"\"mno\"\" pqr\"", "stu \"vwx\" yz"}, @@ -1025,7 +1015,7 @@ TEST_F(CsvReaderTest, SkiprowsNrows) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"int32"}) + .dtypes({dtype()}) .header(1) .skiprows(2) .nrows(2); @@ -1033,7 +1023,7 @@ TEST_F(CsvReaderTest, SkiprowsNrows) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); expect_column_data_equal(std::vector{5, 6}, view.column(0)); } @@ -1049,7 +1039,7 @@ TEST_F(CsvReaderTest, ByteRange) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"int32"}) + .dtypes({dtype()}) .header(-1) .byte_range_offset(11) .byte_range_size(15); @@ -1057,7 +1047,7 @@ TEST_F(CsvReaderTest, ByteRange) const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); expect_column_data_equal(std::vector{4000, 5000, 6000}, view.column(0)); } @@ -1068,14 +1058,14 @@ TEST_F(CsvReaderTest, ByteRangeStrings) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{input.c_str(), 
input.size()}) .names({"A"}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .byte_range_offset(4); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::STRING, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(0).type().id()); expect_column_data_equal(std::vector{"c"}, view.column(0)); } @@ -1091,14 +1081,14 @@ TEST_F(CsvReaderTest, BlanksAndComments) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"int32"}) + .dtypes({dtype()}) .header(-1) .comment('#'); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT32, view.column(0).type().id()); + ASSERT_EQ(type_id::INT32, view.column(0).type().id()); expect_column_data_equal(std::vector{1, 3, 4, 5, 8, 9}, view.column(0)); } @@ -1166,12 +1156,12 @@ TEST_F(CsvReaderTest, ArrowFileSource) auto arrow_source = cudf_io::arrow_io_source{infile}; cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{&arrow_source}) - .dtypes(std::vector{"int8"}); + .dtypes({dtype()}); auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::INT8, view.column(0).type().id()); + ASSERT_EQ(type_id::INT8, view.column(0).type().id()); expect_column_data_equal(std::vector{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0)); } @@ -1187,13 +1177,13 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"float32"}) + .dtypes({dtype()}) .header(-1); const auto result = cudf_io::read_csv(in_opts); const auto view = result.tbl->view(); EXPECT_EQ(1, view.num_columns()); - ASSERT_EQ(cudf::type_id::FLOAT32, view.column(0).type().id()); + ASSERT_EQ(type_id::FLOAT32, view.column(0).type().id()); const auto col_data = cudf::test::to_host(view.column(0)); // col_data.first contains the column data @@ -1212,7 +1202,7 @@ TEST_F(CsvReaderTest, StringInference) const auto result = cudf_io::read_csv(in_opts); EXPECT_EQ(result.tbl->num_columns(), 1); - EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(0).type().id(), type_id::STRING); } TEST_F(CsvReaderTest, TypeInferenceThousands) @@ -1226,9 +1216,9 @@ TEST_F(CsvReaderTest, TypeInferenceThousands) const auto result_view = result.tbl->view(); EXPECT_EQ(result_view.num_columns(), 3); - EXPECT_EQ(result_view.column(0).type().id(), cudf::type_id::INT64); - EXPECT_EQ(result_view.column(1).type().id(), cudf::type_id::INT64); - EXPECT_EQ(result_view.column(2).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(result_view.column(0).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(1).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(2).type().id(), type_id::FLOAT64); auto tsnd_sep_col = std::vector{1400L, 123456L}; auto int_col = std::vector{123L, 123456L}; @@ -1254,9 +1244,9 @@ TEST_F(CsvReaderTest, TypeInferenceWithDecimal) const auto result_view = result.tbl->view(); EXPECT_EQ(result_view.num_columns(), 3); - EXPECT_EQ(result_view.column(0).type().id(), cudf::type_id::INT64); - EXPECT_EQ(result_view.column(1).type().id(), cudf::type_id::STRING); - EXPECT_EQ(result_view.column(2).type().id(), cudf::type_id::FLOAT64); + 
EXPECT_EQ(result_view.column(0).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(1).type().id(), type_id::STRING); + EXPECT_EQ(result_view.column(2).type().id(), type_id::FLOAT64); auto int_col = std::vector{1400L, 123456L}; auto str_col = std::vector{"1.23", "123.456"}; @@ -1296,7 +1286,7 @@ TEST_F(CsvReaderTest, nullHandling) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .na_filter(false) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1310,7 +1300,7 @@ TEST_F(CsvReaderTest, nullHandling) { cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1327,7 +1317,7 @@ TEST_F(CsvReaderTest, nullHandling) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .na_values({"Null"}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1345,7 +1335,7 @@ TEST_F(CsvReaderTest, nullHandling) cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .keep_default_na(false) .na_values({"Null"}) - .dtypes(std::vector{"str"}) + .dtypes({dtype()}) .header(-1) .skip_blank_lines(false); const auto result = cudf_io::read_csv(in_opts); @@ -1477,16 +1467,35 @@ TEST_F(CsvReaderTest, HexTest) std::ofstream outfile(filepath, std::ofstream::out); outfile << "0x0\n-0x1000\n0xfedcba\n0xABCDEF\n0xaBcDeF\n9512c20b\n"; } + // specify hex columns by name + { + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) + .names({"A"}) + .dtypes({dtype()}) + .header(-1) + .parse_hex({"A"}); + auto result = cudf_io::read_csv(in_opts); - cudf_io::csv_reader_options in_opts = - cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) - .names({"A"}) - .dtypes(std::vector{"hex"}) - .header(-1); - auto result = cudf_io::read_csv(in_opts); + expect_column_data_equal( + std::vector{0, -4096, 16702650, 11259375, 11259375, 2501034507}, + result.tbl->view().column(0)); + } + + // specify hex columns by index + { + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) + .names({"A"}) + .dtypes({dtype()}) + .header(-1) + .parse_hex(std::vector{0}); + auto result = cudf_io::read_csv(in_opts); - expect_column_data_equal(std::vector{0, -4096, 16702650, 11259375, 11259375, 2501034507}, - result.tbl->view().column(0)); + expect_column_data_equal( + std::vector{0, -4096, 16702650, 11259375, 11259375, 2501034507}, + result.tbl->view().column(0)); + } } TYPED_TEST(CsvReaderNumericTypeTest, SingleColumnWithWriter) @@ -1555,18 +1564,13 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter) std::vector input_columns{int8_column, int16_column, - int16_column, - int32_column, int32_column, int64_column, - int64_column, uint8_column, uint16_column, uint32_column, uint64_column, float32_column, - float32_column, - float64_column, float64_column}; cudf::table_view input_table{input_columns}; @@ -1577,26 +1581,21 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .header(-1) - .dtypes(std::vector{"int8", - "short", - "int16", - "int", - 
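// Illustrative sketch: HexTest above now enables hexadecimal parsing through
// parse_hex(), by column name or by column index, combined with an ordinary
// integer dtype, replacing the old "hex" dtype string. The element type
// below is an assumption (the diff rendering strips template parameters).
#include <cudf/io/csv.hpp>
#include <cudf/types.hpp>
#include <string>

void read_hex_column(std::string const& filepath)
{
  cudf::io::csv_reader_options opts =
    cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath})
      .names({"A"})
      .dtypes({cudf::data_type{cudf::type_id::INT32}})
      .header(-1)
      .parse_hex({"A"});  // equivalently: .parse_hex(std::vector<int>{0})
  auto result = cudf::io::read_csv(opts);
  // Strings such as "0x0", "-0x1000" and "0xfedcba" are decoded as integers.
}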
"int32", - "long", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "float", - "float32", - "double", - "float64"}); + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype()}); auto result = cudf_io::read_csv(in_opts); const auto result_table = result.tbl->view(); - std::vector non_float64s{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + std::vector non_float64s{0, 1, 2, 3, 4, 5, 6, 7, 8}; const auto input_sliced_view = input_table.select(non_float64s); const auto result_sliced_view = result_table.select(non_float64s); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_sliced_view, result_sliced_view); @@ -1606,9 +1605,6 @@ TEST_F(CsvReaderTest, MultiColumnWithWriter) auto float64_col_idx = non_float64s.size(); check_float_column( input_table.column(float64_col_idx), result_table.column(float64_col_idx), tol, validity); - ++float64_col_idx; - check_float_column( - input_table.column(float64_col_idx), result_table.column(float64_col_idx), tol, validity); } TEST_F(CsvReaderTest, DatesWithWriter) @@ -1633,7 +1629,7 @@ TEST_F(CsvReaderTest, DatesWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .dayfirst(true) .header(-1); auto result = cudf_io::read_csv(in_opts); @@ -1764,7 +1760,7 @@ TEST_F(CsvReaderTest, FloatingPointWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names({"A"}) - .dtypes(std::vector{"float64"}) + .dtypes({dtype()}) .header(-1); // in_opts.lineterminator = ';'; auto result = cudf_io::read_csv(in_opts); @@ -1790,7 +1786,7 @@ TEST_F(CsvReaderTest, StringsWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE); auto result = cudf_io::read_csv(in_opts); @@ -1815,7 +1811,7 @@ TEST_F(CsvReaderTest, StringsWithWriterSimple) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}) + .dtypes(std::vector{dtype(), dtype()}) .quoting(cudf_io::quote_style::NONE); auto result = cudf_io::read_csv(in_opts); @@ -1839,7 +1835,7 @@ TEST_F(CsvReaderTest, StringsEmbeddedDelimiter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str"}); + .dtypes(std::vector{dtype(), dtype()}); auto result = cudf_io::read_csv(in_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view()); @@ -1862,7 +1858,11 @@ TEST_F(CsvReaderTest, HeaderEmbeddedDelimiter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{"int32", "str", "int32", "int32", "int32"}); + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype()}); auto result = cudf_io::read_csv(in_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, result.tbl->view()); @@ -1917,7 +1917,7 @@ TEST_F(CsvReaderTest, UserImplementedSource) TestSource source{csv_data.str()}; cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{&source}) - .dtypes(std::vector{"int8", "int16", "int32"}) + .dtypes({dtype(), dtype(), dtype()}) 
.header(-1); auto result = cudf_io::read_csv(in_opts); @@ -1962,8 +1962,11 @@ TEST_F(CsvReaderTest, DurationsWithWriter) cudf_io::csv_reader_options in_opts = cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) .names(names) - .dtypes(std::vector{ - "timedelta[D]", "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]", "timedelta64[ns]"}); + .dtypes({data_type{type_id::DURATION_DAYS}, + data_type{type_id::DURATION_SECONDS}, + data_type{type_id::DURATION_MILLISECONDS}, + data_type{type_id::DURATION_MICROSECONDS}, + data_type{type_id::DURATION_NANOSECONDS}}); auto result = cudf_io::read_csv(in_opts); const auto result_table = result.tbl->view(); @@ -2164,4 +2167,35 @@ TEST_F(CsvReaderTest, DefaultWriteChunkSize) } } +TEST_F(CsvReaderTest, DtypesMap) +{ + std::string csv_in{"12,9\n34,8\n56,7"}; + + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()}) + .names({"A", "B"}) + .dtypes({{"B", dtype()}, {"A", dtype()}}) + .header(-1); + auto result = cudf_io::read_csv(in_opts); + + const auto result_table = result.tbl->view(); + ASSERT_EQ(result_table.num_columns(), 2); + ASSERT_EQ(result_table.column(0).type(), data_type{type_id::INT32}); + ASSERT_EQ(result_table.column(1).type(), data_type{type_id::INT16}); + expect_column_data_equal(std::vector{12, 34, 56}, result_table.column(0)); + expect_column_data_equal(std::vector{9, 8, 7}, result_table.column(1)); +} + +TEST_F(CsvReaderTest, DtypesMapInvalid) +{ + std::string csv_in{""}; + + cudf_io::csv_reader_options in_opts = + cudf_io::csv_reader_options::builder(cudf_io::source_info{csv_in.c_str(), csv_in.size()}) + .names({"A", "B"}) + .dtypes({{"A", dtype()}}); + + EXPECT_THROW(cudf_io::read_csv(in_opts), cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 308821489c5..e83592a028a 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -42,6 +42,16 @@ using int64_wrapper = wrapper; using timestamp_ms_wrapper = wrapper; using bool_wrapper = wrapper; +using cudf::data_type; +using cudf::type_id; +using cudf::type_to_id; + +template +auto dtype() +{ + return data_type{type_to_id()}; +} + template using column_wrapper = typename std::conditional, @@ -151,7 +161,7 @@ TEST_F(JsonReaderTest, BasicJsonLines) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .dtypes({"int", "float64"}) + .dtypes(std::vector{dtype(), dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -182,7 +192,7 @@ TEST_F(JsonReaderTest, FloatingPoint) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"float32"}) + .dtypes({dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -206,7 +216,7 @@ TEST_F(JsonReaderTest, JsonLinesStrings) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .dtypes({"2:str", "0:int", "1:float64"}) + .dtypes({{"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -245,9 +255,8 @@ TEST_F(JsonReaderTest, MultiColumn) std::ostringstream line; for (int i = 0; i < num_rows; ++i) { line << "[" << std::to_string(int8_values[i]) << "," << int16_values[i] << "," - << int16_values[i] << "," << 
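// Illustrative sketch: DtypesMap above exercises the other new dtypes()
// overload, a map from column name to cudf::data_type, so per-column types
// no longer depend on column order. The types below follow the INT32/INT16
// assertions in that test; the rest of the call is hypothetical.
#include <cudf/io/csv.hpp>
#include <cudf/types.hpp>
#include <string>

void read_csv_with_dtype_map(std::string const& csv_in)
{
  cudf::io::csv_reader_options opts =
    cudf::io::csv_reader_options::builder(
      cudf::io::source_info{csv_in.c_str(), csv_in.size()})
      .names({"A", "B"})
      .dtypes({{"B", cudf::data_type{cudf::type_id::INT16}},
               {"A", cudf::data_type{cudf::type_id::INT32}}})
      .header(-1);
  auto result = cudf::io::read_csv(opts);
  // Column "A" is read as INT32 and "B" as INT16, matching the map keys
  // rather than the order in which the entries were listed.
}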
int32_values[i] << "," << int32_values[i] << "," - << int64_values[i] << "," << int64_values[i] << "," << float32_values[i] << "," - << float32_values[i] << "," << float64_values[i] << "," << float64_values[i] << "]\n"; + << int32_values[i] << "," << int64_values[i] << "," << float32_values[i] << "," + << float64_values[i] << "]\n"; } std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); @@ -255,17 +264,12 @@ TEST_F(JsonReaderTest, MultiColumn) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"int8", - "short", - "int16", - "int", - "int32", - "long", - "int64", - "float", - "float32", - "double", - "float64"}) + .dtypes({dtype(), + dtype(), + dtype(), + dtype(), + dtype(), + dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -275,34 +279,21 @@ TEST_F(JsonReaderTest, MultiColumn) EXPECT_EQ(view.column(0).type().id(), cudf::type_id::INT8); EXPECT_EQ(view.column(1).type().id(), cudf::type_id::INT16); - EXPECT_EQ(view.column(2).type().id(), cudf::type_id::INT16); - EXPECT_EQ(view.column(3).type().id(), cudf::type_id::INT32); - EXPECT_EQ(view.column(4).type().id(), cudf::type_id::INT32); - EXPECT_EQ(view.column(5).type().id(), cudf::type_id::INT64); - EXPECT_EQ(view.column(6).type().id(), cudf::type_id::INT64); - EXPECT_EQ(view.column(7).type().id(), cudf::type_id::FLOAT32); - EXPECT_EQ(view.column(8).type().id(), cudf::type_id::FLOAT32); - EXPECT_EQ(view.column(9).type().id(), cudf::type_id::FLOAT64); - EXPECT_EQ(view.column(10).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(view.column(2).type().id(), cudf::type_id::INT32); + EXPECT_EQ(view.column(3).type().id(), cudf::type_id::INT64); + EXPECT_EQ(view.column(4).type().id(), cudf::type_id::FLOAT32); + EXPECT_EQ(view.column(5).type().id(), cudf::type_id::FLOAT64); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(0), int8_wrapper{int8_values.begin(), int8_values.end(), validity}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(1), int16_wrapper{int16_values.begin(), int16_values.end(), validity}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(2), - int16_wrapper{int16_values.begin(), int16_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(3), - int_wrapper{int32_values.begin(), int32_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(4), int_wrapper{int32_values.begin(), int32_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(5), - int64_wrapper{int64_values.begin(), int64_values.end(), validity}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(6), + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.column(3), int64_wrapper{int64_values.begin(), int64_values.end(), validity}); - check_float_column(view.column(7), float32_values, validity); - check_float_column(view.column(8), float32_values, validity); - check_float_column(view.column(9), float64_values, validity); - check_float_column(view.column(10), float64_values, validity); + check_float_column(view.column(4), float32_values, validity); + check_float_column(view.column(5), float64_values, validity); } TEST_F(JsonReaderTest, Booleans) @@ -315,7 +306,7 @@ TEST_F(JsonReaderTest, Booleans) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"bool"}) + .dtypes({dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -342,7 +333,7 @@ TEST_F(JsonReaderTest, Dates) cudf_io::json_reader_options in_options = 
cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"date"}) + .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .lines(true) .dayfirst(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -379,7 +370,7 @@ TEST_F(JsonReaderTest, Durations) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"timedelta64[ns]"}) + .dtypes({data_type{type_id::DURATION_NANOSECONDS}}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -665,13 +656,12 @@ TEST_F(JsonReaderTest, ArrowFileSource) auto arrow_source = cudf_io::arrow_io_source{infile}; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{&arrow_source}) - .dtypes({"int8"}) + .dtypes({dtype()}) .lines(true); ; cudf_io::table_with_metadata result = cudf_io::read_json(in_options); - EXPECT_EQ(result.tbl->num_columns(), - static_cast(in_options.get_dtypes().size())); + EXPECT_EQ(result.tbl->num_columns(), 1); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT8); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -690,7 +680,7 @@ TEST_F(JsonReaderTest, InvalidFloatingPoint) cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) - .dtypes({"float32"}) + .dtypes({dtype()}) .lines(true); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -898,4 +888,27 @@ TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs) float64_wrapper{{1.1, 2.2, 3.3, 4.4}, validity}); } +TEST_F(JsonReaderTest, BadDtypeParams) +{ + std::string buffer = "[1,2,3,4]"; + + cudf_io::json_reader_options options_vec = + cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()}) + .lines(true) + .dtypes({dtype()}); + + // should throw because there are four columns and only one dtype + EXPECT_THROW(cudf_io::read_json(options_vec), cudf::logic_error); + + cudf_io::json_reader_options options_map = + cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()}) + .lines(true) + .dtypes(std::map{{"0", dtype()}, + {"1", dtype()}, + {"2", dtype()}, + {"wrong_name", dtype()}}); + // should throw because one of the columns is not in the dtype map + EXPECT_THROW(cudf_io::read_json(options_map), cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 8fdfc6f9165..7260aa9e686 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -275,10 +275,10 @@ inline auto random_values(size_t size) TYPED_TEST(ParquetWriterNumericTypeTest, SingleColumn) { auto sequence = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return TypeParam(i); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return TypeParam(i % 400); }); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - constexpr auto num_rows = 100; + constexpr auto num_rows = 800; column_wrapper col(sequence, sequence + num_rows, validity); std::vector> cols; @@ -816,7 +816,7 @@ TEST_F(ParquetWriterTest, MultiIndex) expected_metadata.column_metadata[3].set_name("floats"); expected_metadata.column_metadata[4].set_name("doubles"); expected_metadata.user_data.insert( - {"pandas", "\"index_columns\": [\"floats\", \"doubles\"], \"column1\": [\"int8s\"]"}); + {"pandas", 
"\"index_columns\": [\"int8s\", \"int16s\"], \"column1\": [\"int32s\"]"}); auto filepath = temp_env->get_temp_filepath("MultiIndex.parquet"); cudf_io::parquet_writer_options out_opts = @@ -827,7 +827,7 @@ TEST_F(ParquetWriterTest, MultiIndex) cudf_io::parquet_reader_options in_opts = cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}) .use_pandas_metadata(true) - .columns({"int8s", "int16s", "int32s"}); + .columns({"int32s", "floats", "doubles"}); auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); @@ -967,8 +967,6 @@ TEST_F(ParquetWriterTest, StructOfList) auto struct_2 = cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); - // cudf::test::print(struct_2->child(1).child(2)); - auto expected = table_view({*struct_2}); cudf_io::table_input_metadata expected_metadata(expected); @@ -2497,6 +2495,131 @@ TEST_F(ParquetReaderTest, ReorderedColumns) } } +TEST_F(ParquetReaderTest, SelectNestedColumn) +{ + // Struct>, + // flats:List> + // > + // > + + auto weights_col = cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; + + auto ages_col = + cudf::test::fixed_width_column_wrapper{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto struct_1 = cudf::test::structs_column_wrapper{{weights_col, ages_col}, {1, 1, 1, 1, 0, 1}}; + + auto is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + + auto struct_2 = + cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release(); + + auto input = table_view({*struct_2}); + + cudf_io::table_input_metadata input_metadata(input); + input_metadata.column_metadata[0].set_name("being"); + input_metadata.column_metadata[0].child(0).set_name("human?"); + input_metadata.column_metadata[0].child(1).set_name("particulars"); + input_metadata.column_metadata[0].child(1).child(0).set_name("weight"); + input_metadata.column_metadata[0].child(1).child(1).set_name("age"); + + auto filepath = temp_env->get_temp_filepath("SelectNestedColumn.parquet"); + cudf_io::parquet_writer_options args = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, input) + .metadata(&input_metadata); + cudf_io::write_parquet(args); + + { // Test selecting a single leaf from the table + cudf_io::parquet_reader_options read_args = + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath)) + .columns({"being.particulars.age"}); + const auto result = cudf_io::read_parquet(read_args); + + auto expect_ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + auto expect_s_1 = cudf::test::structs_column_wrapper{{expect_ages_col}, {1, 1, 1, 1, 0, 1}}; + auto expect_s_2 = + cudf::test::structs_column_wrapper{{expect_s_1}, {0, 1, 1, 1, 1, 1}}.release(); + auto expected = table_view({*expect_s_2}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("being"); + expected_metadata.column_metadata[0].child(0).set_name("particulars"); + expected_metadata.column_metadata[0].child(0).child(0).set_name("age"); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + compare_metadata_equality(expected_metadata, result.metadata); + } + + { // Test selecting a non-leaf and expecting all hierarchy from that node onwards + cudf_io::parquet_reader_options read_args = + 
cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath)) + .columns({"being.particulars"}); + const auto result = cudf_io::read_parquet(read_args); + + auto expected_weights_col = + cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; + + auto expected_ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto expected_s_1 = cudf::test::structs_column_wrapper{ + {expected_weights_col, expected_ages_col}, {1, 1, 1, 1, 0, 1}}; + + auto expect_s_2 = + cudf::test::structs_column_wrapper{{expected_s_1}, {0, 1, 1, 1, 1, 1}}.release(); + auto expected = table_view({*expect_s_2}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("being"); + expected_metadata.column_metadata[0].child(0).set_name("particulars"); + expected_metadata.column_metadata[0].child(0).child(0).set_name("weight"); + expected_metadata.column_metadata[0].child(0).child(1).set_name("age"); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + compare_metadata_equality(expected_metadata, result.metadata); + } + + { // Test selecting struct children out of order + cudf_io::parquet_reader_options read_args = + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath)) + .columns({"being.particulars.age", "being.particulars.weight", "being.human?"}); + const auto result = cudf_io::read_parquet(read_args); + + auto expected_weights_col = + cudf::test::fixed_width_column_wrapper{1.1, 2.4, 5.3, 8.0, 9.6, 6.9}; + + auto expected_ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}}; + + auto expected_is_human_col = cudf::test::fixed_width_column_wrapper{ + {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}}; + + auto expect_s_1 = cudf::test::structs_column_wrapper{{expected_ages_col, expected_weights_col}, + {1, 1, 1, 1, 0, 1}}; + + auto expect_s_2 = + cudf::test::structs_column_wrapper{{expect_s_1, expected_is_human_col}, {0, 1, 1, 1, 1, 1}} + .release(); + + auto expected = table_view({*expect_s_2}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("being"); + expected_metadata.column_metadata[0].child(0).set_name("particulars"); + expected_metadata.column_metadata[0].child(0).child(0).set_name("age"); + expected_metadata.column_metadata[0].child(0).child(1).set_name("weight"); + expected_metadata.column_metadata[0].child(1).set_name("human?"); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + compare_metadata_equality(expected_metadata, result.metadata); + } +} + TEST_F(ParquetReaderTest, DecimalRead) { { diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu index 57abdf17aa6..8018d613e05 100644 --- a/cpp/tests/join/conditional_join_tests.cu +++ b/cpp/tests/join/conditional_join_tests.cu @@ -14,8 +14,7 @@ * limitations under the License. */ -#include -#include +#include #include #include #include @@ -50,7 +49,7 @@ const auto col_ref_right_1 = cudf::ast::column_reference(1, cudf::ast::table_ref // Common expressions. 
auto left_zero_eq_right_zero = - cudf::ast::expression(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); + cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); } // namespace /** @@ -147,15 +146,17 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { */ void test(std::vector> left_data, std::vector> right_data, - cudf::ast::expression predicate, + cudf::ast::operation predicate, std::vector> expected_outputs) { // Note that we need to maintain the column wrappers otherwise the // resulting column views will be referencing potentially invalid memory. auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] = this->parse_input(left_data, right_data); - auto result = this->join(left, right, predicate); + auto result_size = this->join_size(left, right, predicate); + EXPECT_TRUE(result_size == expected_outputs.size()); + auto result = this->join(left, right, predicate); std::vector> result_pairs; for (size_t i = 0; i < result.first->size(); ++i) { // Note: Not trying to be terribly efficient here since these tests are @@ -167,20 +168,22 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { std::sort(result_pairs.begin(), result_pairs.end()); std::sort(expected_outputs.begin(), expected_outputs.end()); - EXPECT_TRUE(std::equal(result_pairs.begin(), result_pairs.end(), expected_outputs.begin())); + EXPECT_TRUE(std::equal(expected_outputs.begin(), expected_outputs.end(), result_pairs.begin())); } void test_nulls(std::vector, std::vector>> left_data, std::vector, std::vector>> right_data, - cudf::ast::expression predicate, + cudf::ast::operation predicate, std::vector> expected_outputs) { // Note that we need to maintain the column wrappers otherwise the // resulting column views will be referencing potentially invalid memory. auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] = this->parse_input(left_data, right_data); - auto result = this->join(left, right, predicate); + auto result_size = this->join_size(left, right, predicate); + EXPECT_TRUE(result_size == expected_outputs.size()); + auto result = this->join(left, right, predicate); std::vector> result_pairs; for (size_t i = 0; i < result.first->size(); ++i) { // Note: Not trying to be terribly efficient here since these tests are @@ -192,7 +195,7 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { std::sort(result_pairs.begin(), result_pairs.end()); std::sort(expected_outputs.begin(), expected_outputs.end()); - EXPECT_TRUE(std::equal(result_pairs.begin(), result_pairs.end(), expected_outputs.begin())); + EXPECT_TRUE(std::equal(expected_outputs.begin(), expected_outputs.end(), result_pairs.begin())); } /* @@ -238,7 +241,7 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { thrust::sort(thrust::device, reference_pairs.begin(), reference_pairs.end()); EXPECT_TRUE(thrust::equal( - thrust::device, result_pairs.begin(), result_pairs.end(), reference_pairs.begin())); + thrust::device, reference_pairs.begin(), reference_pairs.end(), result_pairs.begin())); } /** @@ -248,7 +251,16 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest { */ virtual std::pair>, std::unique_ptr>> - join(cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) = 0; + join(cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) = 0; + + /** + * This method must be implemented by subclasses for specific types of joins. 
+ * It should be a simply forwarding of arguments to the appropriate cudf + * conditional join size computation API. + */ + virtual std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) = 0; /** * This method must be implemented by subclasses for specific types of joins. @@ -267,11 +279,18 @@ template struct ConditionalInnerJoinTest : public ConditionalJoinPairReturnTest { std::pair>, std::unique_ptr>> - join(cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override + join(cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override { return cudf::conditional_inner_join(left, right, predicate); } + std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) override + { + return cudf::conditional_inner_join_size(left, right, predicate); + } + std::pair>, std::unique_ptr>> reference_join(cudf::table_view left, cudf::table_view right) override @@ -316,7 +335,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestTwoColumnThreeRowSomeEqual) TYPED_TEST(ConditionalInnerJoinTest, TestNotComparison) { auto col_ref_0 = cudf::ast::column_reference(0); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::NOT, col_ref_0); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::NOT, col_ref_0); this->test({{0, 1, 2}}, {{3, 4, 5}}, expression, {{0, 0}, {0, 1}, {0, 2}}); }; @@ -325,7 +344,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestGreaterComparison) { auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); this->test({{0, 1, 2}}, {{1, 0, 0}}, expression, {{1, 1}, {1, 2}, {2, 0}, {2, 1}, {2, 2}}); }; @@ -334,7 +353,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestGreaterTwoColumnComparison) { auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); this->test({{0, 1, 2}, {0, 0, 0}}, {{0, 0, 0}, {1, 0, 0}}, @@ -346,7 +365,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestGreaterDifferentNumberColumnComparison) { auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); this->test( {{0, 1, 2}}, {{0, 0, 0}, {1, 0, 0}}, expression, {{1, 1}, {1, 2}, {2, 0}, {2, 1}, {2, 2}}); @@ -356,7 +375,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestGreaterDifferentNumberColumnDifferentSi { auto col_ref_0 = cudf::ast::column_reference(0); auto col_ref_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); - auto expression = cudf::ast::expression(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_0, col_ref_1); this->test({{0, 1}}, {{0, 0, 0}, {1, 0, 0}}, expression, {{1, 1}, {1, 2}}); }; @@ -367,14 +386,14 @@ TYPED_TEST(ConditionalInnerJoinTest, 
TestComplexConditionMultipleColumns) auto col_ref_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT); auto scalar_1 = cudf::numeric_scalar(1); auto literal_1 = cudf::ast::literal(scalar_1); - auto left_0_equal_1 = cudf::ast::expression(cudf::ast::ast_operator::EQUAL, col_ref_0, literal_1); + auto left_0_equal_1 = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_0, literal_1); auto col_ref_1 = cudf::ast::column_reference(1, cudf::ast::table_reference::RIGHT); auto comparison_filter = - cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_1, col_ref_0); + cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_1, col_ref_0); auto expression = - cudf::ast::expression(cudf::ast::ast_operator::LOGICAL_AND, left_0_equal_1, comparison_filter); + cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, left_0_equal_1, comparison_filter); this->test({{0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}}, {{0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2}, @@ -384,6 +403,20 @@ TYPED_TEST(ConditionalInnerJoinTest, TestComplexConditionMultipleColumns) {{4, 0}, {5, 0}, {6, 0}, {7, 0}}); }; +TYPED_TEST(ConditionalInnerJoinTest, TestSymmetry) +{ + auto col_ref_0 = cudf::ast::column_reference(0); + auto col_ref_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_1, col_ref_0); + auto expression_reverse = + cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); + + this->test( + {{0, 1, 2}}, {{1, 2, 3}}, expression, {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}); + this->test( + {{0, 1, 2}}, {{1, 2, 3}}, expression_reverse, {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}); +}; + TYPED_TEST(ConditionalInnerJoinTest, TestCompareRandomToHash) { // Generate columns of 10 repeats of the integer range [0, 10), then merge @@ -418,7 +451,7 @@ TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnTwoNullsRowAllEqual) TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnTwoNullsNoOutputRowAllEqual) { - this->test_nulls({{{0, 1}, {0, 1}}}, {{{0, 0}, {1, 1}}}, left_zero_eq_right_zero, {{}, {}}); + this->test_nulls({{{0, 1}, {0, 1}}}, {{{0, 0}, {1, 1}}}, left_zero_eq_right_zero, {}); }; /** @@ -428,11 +461,18 @@ template struct ConditionalLeftJoinTest : public ConditionalJoinPairReturnTest { std::pair>, std::unique_ptr>> - join(cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override + join(cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override { return cudf::conditional_left_join(left, right, predicate); } + std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) override + { + return cudf::conditional_left_join_size(left, right, predicate); + } + std::pair>, std::unique_ptr>> reference_join(cudf::table_view left, cudf::table_view right) override @@ -484,11 +524,21 @@ template struct ConditionalFullJoinTest : public ConditionalJoinPairReturnTest { std::pair>, std::unique_ptr>> - join(cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override + join(cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override { return cudf::conditional_full_join(left, right, predicate); } + std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) override + { + // Full joins don't actually support size calculations, but to support a + // uniform 
testing framework we just calculate it from the result of doing + // the join. + return cudf::conditional_full_join(left, right, predicate).first->size(); + } + std::pair>, std::unique_ptr>> reference_join(cudf::table_view left, cudf::table_view right) override @@ -499,6 +549,19 @@ struct ConditionalFullJoinTest : public ConditionalJoinPairReturnTest { TYPED_TEST_CASE(ConditionalFullJoinTest, cudf::test::IntegralTypesNotBool); +TYPED_TEST(ConditionalFullJoinTest, TestOneColumnNoneEqual) +{ + this->test({{0, 1, 2}}, + {{3, 4, 5}}, + left_zero_eq_right_zero, + {{0, JoinNoneValue}, + {1, JoinNoneValue}, + {2, JoinNoneValue}, + {JoinNoneValue, 0}, + {JoinNoneValue, 1}, + {JoinNoneValue, 2}}); +}; + TYPED_TEST(ConditionalFullJoinTest, TestTwoColumnThreeRowSomeEqual) { this->test({{0, 1, 2}, {10, 20, 30}}, @@ -546,13 +609,15 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest { */ void test(std::vector> left_data, std::vector> right_data, - cudf::ast::expression predicate, + cudf::ast::operation predicate, std::vector expected_outputs) { auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] = this->parse_input(left_data, right_data); - auto result = this->join(left, right, predicate); + auto result_size = this->join_size(left, right, predicate); + EXPECT_TRUE(result_size == expected_outputs.size()); + auto result = this->join(left, right, predicate); std::vector resulting_indices; for (size_t i = 0; i < result->size(); ++i) { // Note: Not trying to be terribly efficient here since these tests are @@ -595,7 +660,16 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest { * conditional join API. */ virtual std::unique_ptr> join( - cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) = 0; + cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) = 0; + + /** + * This method must be implemented by subclasses for specific types of joins. + * It should be a simply forwarding of arguments to the appropriate cudf + * conditional join size computation API. + */ + virtual std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) = 0; /** * This method must be implemented by subclasses for specific types of joins. 
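For reference, a minimal sketch (not taken from this patch; the cudf::table_view inputs `left` and `right` and the exact include paths are assumptions) of the pattern the updated conditional-join tests exercise: build the predicate with cudf::ast::operation, ask the matching *_size API how many result pairs to expect, then run the join itself.

#include <cudf/ast/expressions.hpp>
#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>

#include <cassert>

void inner_join_with_size_check(cudf::table_view const& left, cudf::table_view const& right)
{
  // Predicate: left column 0 == right column 0, expressed as an AST operation.
  auto const left_col  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
  auto const right_col = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
  auto const predicate =
    cudf::ast::operation(cudf::ast::ast_operator::EQUAL, left_col, right_col);

  // The size API reports how many matching index pairs the join will produce.
  auto const expected_size = cudf::conditional_inner_join_size(left, right, predicate);

  // The join itself returns a pair of device vectors of matching row indices.
  auto const [left_indices, right_indices] = cudf::conditional_inner_join(left, right, predicate);
  assert(left_indices->size() == expected_size);
  assert(right_indices->size() == expected_size);
}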
@@ -612,11 +686,18 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest { template struct ConditionalLeftSemiJoinTest : public ConditionalJoinSingleReturnTest { std::unique_ptr> join( - cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override + cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override { return cudf::conditional_left_semi_join(left, right, predicate); } + std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) override + { + return cudf::conditional_left_semi_join_size(left, right, predicate); + } + std::unique_ptr> reference_join( cudf::table_view left, cudf::table_view right) override { @@ -663,11 +744,18 @@ TYPED_TEST(ConditionalLeftSemiJoinTest, TestCompareRandomToHash) template struct ConditionalLeftAntiJoinTest : public ConditionalJoinSingleReturnTest { std::unique_ptr> join( - cudf::table_view left, cudf::table_view right, cudf::ast::expression predicate) override + cudf::table_view left, cudf::table_view right, cudf::ast::operation predicate) override { return cudf::conditional_left_anti_join(left, right, predicate); } + std::size_t join_size(cudf::table_view left, + cudf::table_view right, + cudf::ast::operation predicate) override + { + return cudf::conditional_left_anti_join_size(left, right, predicate); + } + std::unique_ptr> reference_join( cudf::table_view left, cudf::table_view right) override { diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp b/cpp/tests/reshape/interleave_columns_tests.cpp index 386fd9d08ee..e51f0740787 100644 --- a/cpp/tests/reshape/interleave_columns_tests.cpp +++ b/cpp/tests/reshape/interleave_columns_tests.cpp @@ -24,7 +24,7 @@ using namespace cudf::test::iterators; -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; +constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; template struct InterleaveColumnsTest : public cudf::test::BaseFixture { @@ -378,7 +378,7 @@ using IntListsCol = cudf::test::lists_column_wrapper; using IntCol = cudf::test::fixed_width_column_wrapper; using TView = cudf::table_view; -constexpr int32_t null{0}; +constexpr int32_t null{0}; // mark for null elements } // namespace struct ListsColumnsInterleaveTest : public cudf::test::BaseFixture { @@ -731,4 +731,341 @@ TEST_F(ListsColumnsInterleaveTest, SlicedStringsColumnsInputWithNulls) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, verbosity); } +namespace { +using StructsCol = cudf::test::structs_column_wrapper; +using StringsCol = cudf::test::strings_column_wrapper; +} // namespace + +struct StructsColumnsInterleaveTest : public cudf::test::BaseFixture { +}; + +TEST_F(StructsColumnsInterleaveTest, InvalidInput) +{ + // Input table contains non-structs column + { + auto const col1 = IntCol{}; + auto const col2 = StructsCol{}; + EXPECT_THROW(cudf::interleave_columns(TView{{col1, col2}}), cudf::logic_error); + } + + // Types mismatch + { + auto const structs1 = [] { + auto child1 = IntCol{1, 2, 3}; + auto child2 = IntCol{4, 5, 6}; + return StructsCol{{child1, child2}}; + }(); + + auto const structs2 = [] { + auto child1 = IntCol{7, 8, 9}; + auto child2 = StringsCol{"", "abc", "123"}; + return StructsCol{{child1, child2}}; + }(); + + EXPECT_THROW(cudf::interleave_columns(TView{{structs1, structs2}}), cudf::logic_error); + } + + // Numbers of children mismatch + { + auto const structs1 = [] { + auto child1 = IntCol{1, 2, 3}; + auto 
child2 = IntCol{4, 5, 6}; + return StructsCol{{child1, child2}}; + }(); + + auto const structs2 = [] { + auto child1 = IntCol{7, 8, 9}; + auto child2 = IntCol{10, 11, 12}; + auto child3 = IntCol{13, 14, 15}; + return StructsCol{{child1, child2, child3}}; + }(); + + EXPECT_THROW(cudf::interleave_columns(TView{{structs1, structs2}}), cudf::logic_error); + } +} + +TEST_F(StructsColumnsInterleaveTest, InterleaveEmptyColumns) +{ + auto const structs = StructsCol{}; + auto const results = cudf::interleave_columns(TView{{structs, structs}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(structs, *results, verbosity); +} + +template +struct StructsColumnsInterleaveTypedTest : public cudf::test::BaseFixture { +}; + +using TypesForTest = cudf::test::Concat; +TYPED_TEST_SUITE(StructsColumnsInterleaveTypedTest, TypesForTest); + +TYPED_TEST(StructsColumnsInterleaveTypedTest, InterleaveOneColumnNotNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + + auto const structs = [] { + auto child1 = ColWrapper{1, 2, 3}; + auto child2 = ColWrapper{4, 5, 6}; + auto child3 = StringsCol{"Banana", "Mango", "Apple"}; + return StructsCol{{child1, child2, child3}}; + }(); + auto const results = cudf::interleave_columns(TView{{structs}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(structs, *results, verbosity); +} + +TYPED_TEST(StructsColumnsInterleaveTypedTest, InterleaveOneColumnWithNulls) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + + auto const structs = [] { + auto child1 = ColWrapper{{1, 2, null, 3}, null_at(2)}; + auto child2 = ColWrapper{{4, null, 5, 6}, null_at(1)}; + auto child3 = StringsCol{{"" /*NULL*/, "Banana", "Mango", "Apple"}, null_at(0)}; + return StructsCol{{child1, child2, child3}, null_at(3)}; + }(); + auto const results = cudf::interleave_columns(TView{{structs}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(structs, *results, verbosity); +} + +TYPED_TEST(StructsColumnsInterleaveTypedTest, SimpleInputNoNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + + auto const structs1 = [] { + auto child1 = ColWrapper{1, 2, 3}; + auto child2 = ColWrapper{4, 5, 6}; + auto child3 = StringsCol{"Banana", "Mango", "Apple"}; + return StructsCol{{child1, child2, child3}}; + }(); + + auto const structs2 = [] { + auto child1 = ColWrapper{7, 8, 9}; + auto child2 = ColWrapper{10, 11, 12}; + auto child3 = StringsCol{"Bear", "Duck", "Cat"}; + return StructsCol{{child1, child2, child3}}; + }(); + + auto const expected = [] { + auto child1 = ColWrapper{1, 7, 2, 8, 3, 9}; + auto child2 = ColWrapper{4, 10, 5, 11, 6, 12}; + auto child3 = StringsCol{"Banana", "Bear", "Mango", "Duck", "Apple", "Cat"}; + return StructsCol{{child1, child2, child3}}; + }(); + + auto const results = cudf::interleave_columns(TView{{structs1, structs2}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity); +} + +TYPED_TEST(StructsColumnsInterleaveTypedTest, SimpleInputWithNulls) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + + auto const structs1 = [] { + auto child1 = ColWrapper{{1, 2, null, 3, 4}, null_at(2)}; + auto child2 = ColWrapper{{4, null, 5, 6, 7}, null_at(1)}; + auto child3 = StringsCol{{"" /*NULL*/, "Banana", "Mango", "Apple", "Cherry"}, null_at(0)}; + return StructsCol{{child1, child2, child3}, null_at(0)}; + }(); + + auto const structs2 = [] { + auto child1 = ColWrapper{{7, null, null, 8, 9}, nulls_at({1, 2})}; + auto child2 = ColWrapper{{10, 11, 12, null, 14}, null_at(3)}; + auto child3 = StringsCol{"Bear", "Duck", "Cat", "Dog", "Panda"}; + return 
StructsCol{{child1, child2, child3}, null_at(4)}; + }(); + + auto const structs3 = [] { + auto child1 = ColWrapper{{-1, -2, -3, 0, null}, null_at(4)}; + auto child2 = ColWrapper{{-5, 0, null, -1, -10}, null_at(2)}; + auto child3 = StringsCol{"111", "Bànànà", "abcxyz", "é á í", "zzz"}; + return StructsCol{{child1, child2, child3}, null_at(1)}; + }(); + + auto const expected = [] { + auto child1 = ColWrapper{{1, 7, -1, 2, null, -2, null, null, -3, 3, 8, 0, 4, 9, null}, + nulls_at({4, 6, 7, 14})}; + auto child2 = ColWrapper{{4, 10, -5, null, 11, 0, 5, 12, null, 6, null, -1, 7, 14, -10}, + nulls_at({3, 8, 10})}; + auto child3 = StringsCol{{"" /*NULL*/, + "Bear", + "111", + "Banana", + "Duck", + "Bànànà", + "Mango", + "Cat", + "abcxyz", + "Apple", + "Dog", + "é á í", + "Cherry", + "Panda", + "zzz"}, + null_at(0)}; + return StructsCol{{child1, child2, child3}, nulls_at({0, 5, 13})}; + }(); + + auto const results = cudf::interleave_columns(TView{{structs1, structs2, structs3}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity); +} + +TYPED_TEST(StructsColumnsInterleaveTypedTest, NestedInputStructsColumns) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + + auto const structs1 = [] { + auto child_structs1 = [] { + auto child1 = ColWrapper{{null, 2, 3, 4, 5}, null_at(0)}; + auto child2 = ColWrapper{{6, 7, 8, null, 10}, null_at(3)}; + return StructsCol{{child1, child2}, null_at(0)}; + }(); + + auto child_structs2 = [] { + auto child1 = ColWrapper{{11, null, 13, 14, 15}, null_at(1)}; + auto child2 = ColWrapper{{null, 17, 18, 19, 20}, null_at(0)}; + return StructsCol{{child1, child2}, nulls_at({0, 1})}; + }(); + + auto child_strings = [] { return StringsCol{"Banana", "Mango", "Apple", "Cherry", "Kiwi"}; }(); + + return StructsCol{{child_structs1, child_structs2, child_strings}, null_at(0)}; + }(); + + auto const structs2 = [] { + auto child_structs1 = [] { + auto child1 = ColWrapper{{-1, null, -3, -4, -5}, null_at(1)}; + auto child2 = ColWrapper{{-6, -7, -8, null, -10}, null_at(3)}; + return StructsCol{{child1, child2}}; + }(); + + auto child_structs2 = [] { + auto child1 = ColWrapper{{-11, -12, null, -14, -15}, null_at(2)}; + auto child2 = ColWrapper{{-16, -17, -18, -19, null}, null_at(4)}; + return StructsCol{{child1, child2}, null_at(2)}; + }(); + + auto child_strings = [] { return StringsCol{"Bear", "Duck", "Cat", "Dog", "Rabbit"}; }(); + + return StructsCol{{child_structs1, child_structs2, child_strings}, null_at(2)}; + }(); + + auto const expected = [] { + auto child_structs1 = [] { + auto child1 = ColWrapper{{null, -1, 2, null, 3, -3, 4, -4, 5, -5}, nulls_at({0, 3})}; + auto child2 = ColWrapper{{6, -6, 7, -7, 8, -8, null, null, 10, -10}, nulls_at({6, 7})}; + return StructsCol{{child1, child2}, null_at(0)}; + }(); + + auto child_structs2 = [] { + auto child1 = ColWrapper{{11, -11, null, -12, 13, null, 14, -14, 15, -15}, nulls_at({2, 5})}; + auto child2 = ColWrapper{{null, -16, 17, -17, 18, -18, 19, -19, 20, null}, nulls_at({0, 9})}; + return StructsCol{{child1, child2}, nulls_at({0, 2, 5})}; + }(); + + auto child_strings = [] { + return StringsCol{ + "Banana", "Bear", "Mango", "Duck", "Apple", "Cat", "Cherry", "Dog", "Kiwi", "Rabbit"}; + }(); + + return StructsCol{{child_structs1, child_structs2, child_strings}, nulls_at({0, 5})}; + }(); + + auto const results = cudf::interleave_columns(TView{{structs1, structs2}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity); +} + +TYPED_TEST(StructsColumnsInterleaveTypedTest, 
SlicedColumnsInputNoNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + constexpr int32_t NOT_USE{-1}; // mark for elements that we don't care + + auto const structs1_original = [] { + auto child1 = ColWrapper{NOT_USE, NOT_USE, 1, 2, 3, NOT_USE}; + auto child2 = ColWrapper{NOT_USE, NOT_USE, 4, 5, 6, NOT_USE}; + auto child3 = StringsCol{"NOT_USE", "NOT_USE", "Banana", "Mango", "Apple", "NOT_USE"}; + return StructsCol{{child1, child2, child3}}; + }(); + + // structs2 has more rows than structs1 + auto const structs2_original = [] { + auto child1 = ColWrapper{NOT_USE, 7, 8, 9, NOT_USE, NOT_USE, NOT_USE}; + auto child2 = ColWrapper{NOT_USE, 10, 11, 12, NOT_USE, NOT_USE, NOT_USE}; + auto child3 = StringsCol{"NOT_USE", "Bear", "Duck", "Cat", "NOT_USE", "NOT_USE", "NOT_USE"}; + return StructsCol{{child1, child2, child3}}; + }(); + + auto const expected = [] { + auto child1 = ColWrapper{1, 7, 2, 8, 3, 9}; + auto child2 = ColWrapper{4, 10, 5, 11, 6, 12}; + auto child3 = StringsCol{"Banana", "Bear", "Mango", "Duck", "Apple", "Cat"}; + return StructsCol{{child1, child2, child3}}; + }(); + + auto const structs1 = cudf::slice(structs1_original, {2, 5})[0]; + auto const structs2 = cudf::slice(structs2_original, {1, 4})[0]; + auto const results = cudf::interleave_columns(TView{{structs1, structs2}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity); +} + +TYPED_TEST(StructsColumnsInterleaveTypedTest, SlicedColumnsInputWithNulls) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + constexpr int32_t NOT_USE{-1}; // mark for elements that we don't care + + auto const structs1_original = [] { + auto child1 = ColWrapper{{NOT_USE, NOT_USE, 1, 2, null, 3, 4, NOT_USE}, null_at(4)}; + auto child2 = ColWrapper{{NOT_USE, NOT_USE, 4, null, 5, 6, 7, NOT_USE}, null_at(3)}; + auto child3 = StringsCol{ + {"NOT_USE", "NOT_USE", "" /*NULL*/, "Banana", "Mango", "Apple", "Cherry", "NOT_USE"}, + null_at(2)}; + return StructsCol{{child1, child2, child3}, null_at(2)}; + }(); + + auto const structs2_original = [] { + auto child1 = ColWrapper{{7, null, null, 8, 9, NOT_USE, NOT_USE}, nulls_at({1, 2})}; + auto child2 = ColWrapper{{10, 11, 12, null, 14, NOT_USE, NOT_USE}, null_at(3)}; + auto child3 = StringsCol{"Bear", "Duck", "Cat", "Dog", "Panda", "NOT_USE", "NOT_USE"}; + return StructsCol{{child1, child2, child3}, null_at(4)}; + }(); + + auto const structs3_original = [] { + auto child1 = ColWrapper{{NOT_USE, NOT_USE, NOT_USE, -1, -2, -3, 0, null}, null_at(7)}; + auto child2 = ColWrapper{{NOT_USE, NOT_USE, NOT_USE, -5, 0, null, -1, -10}, null_at(5)}; + auto child3 = + StringsCol{"NOT_USE", "NOT_USE", "NOT_USE", "111", "Bànànà", "abcxyz", "é á í", "zzz"}; + return StructsCol{{child1, child2, child3}, null_at(4)}; + }(); + + auto const expected = [] { + auto child1 = ColWrapper{{1, 7, -1, 2, null, -2, null, null, -3, 3, 8, 0, 4, 9, null}, + nulls_at({4, 6, 7, 14})}; + auto child2 = ColWrapper{{4, 10, -5, null, 11, 0, 5, 12, null, 6, null, -1, 7, 14, -10}, + nulls_at({3, 8, 10})}; + auto child3 = StringsCol{{"" /*NULL*/, + "Bear", + "111", + "Banana", + "Duck", + "Bànànà", + "Mango", + "Cat", + "abcxyz", + "Apple", + "Dog", + "é á í", + "Cherry", + "Panda", + "zzz"}, + null_at(0)}; + return StructsCol{{child1, child2, child3}, nulls_at({0, 5, 13})}; + }(); + + auto const structs1 = cudf::slice(structs1_original, {2, 7})[0]; + auto const structs2 = cudf::slice(structs2_original, {0, 5})[0]; + auto const structs3 = cudf::slice(structs3_original, {3, 8})[0]; + auto const results = 
cudf::interleave_columns(TView{{structs1, structs2, structs3}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, verbosity); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp index cb123114fd8..72b30c19fd5 100644 --- a/cpp/tests/rolling/grouped_rolling_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_test.cpp @@ -139,19 +139,6 @@ class GroupedRollingTest : public cudf::test::BaseFixture { auto reference = create_reference_output( op, input, expected_grouping, preceding_window, following_window, min_periods); -#ifndef NDEBUG - std::cout << "input:\n"; - cudf::test::print(input, std::cout, ", "); - std::cout << "\n"; - std::cout << "output:\n"; - cudf::test::print(*output, std::cout, ", "); - std::cout << "\n"; - std::cout << "reference:\n"; - cudf::test::print(*reference, std::cout, ", "); - std::cout << "\n"; - std::cout << "\n"; -#endif - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, *reference); } @@ -709,19 +696,6 @@ class GroupedTimeRangeRollingTest : public cudf::test::BaseFixture { following_window_in_days, min_periods); -#ifndef NDEBUG - std::cout << "input:\n"; - cudf::test::print(input, std::cout, ", "); - std::cout << "\n"; - std::cout << "output:\n"; - cudf::test::print(*output, std::cout, ", "); - std::cout << "\n"; - std::cout << "reference:\n"; - cudf::test::print(*reference, std::cout, ", "); - std::cout << "\n"; - std::cout << "\n"; -#endif - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, *reference); } diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index a67e670acb7..ec88500fde1 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -190,19 +190,6 @@ class RollingTest : public cudf::test::BaseFixture { auto reference = create_reference_output(op, input, preceding_window, following_window, min_periods); -#if 0 - std::cout << "input:\n"; - cudf::test::print(input, std::cout, ", "); - std::cout << "\n"; - std::cout << "output:\n"; - cudf::test::print(*output, std::cout, ", "); - std::cout << "\n"; - std::cout << "reference:\n"; - cudf::test::print(reference, std::cout, ", "); - std::cout << "\n"; - std::cout << "\n"; -#endif - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*output, *reference); } diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp index 2047d815867..b54594fd1c4 100644 --- a/cpp/tests/scalar/scalar_test.cpp +++ b/cpp/tests/scalar/scalar_test.cpp @@ -14,19 +14,12 @@ * limitations under the License. 
*/ -#include -#include -#include #include #include -#include #include -#include #include -#include -#include -#include +#include template struct TypedScalarTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/strings/combine/join_list_elements_tests.cpp b/cpp/tests/strings/combine/join_list_elements_tests.cpp index d4aafbf5f23..bf739e83241 100644 --- a/cpp/tests/strings/combine/join_list_elements_tests.cpp +++ b/cpp/tests/strings/combine/join_list_elements_tests.cpp @@ -90,11 +90,11 @@ TEST_F(StringsListsConcatenateTest, ZeroSizeStringsInput) auto const expected = STR_COL{"", "", "", ""}; auto results = cudf::strings::join_list_elements(string_lv); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); auto const separators = STR_COL{"", "", "", ""}.release(); results = cudf::strings::join_list_elements(string_lv, separators->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity); } // Empty list results in null diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index a2486d60051..1f01f0f1429 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -167,6 +167,20 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsReplaceTests, ReplaceBackrefsRegexAltIndexPatternTest) +{ + cudf::test::strings_column_wrapper strings({"12-3 34-5 67-89", "0-99: 777-888:: 5673-0"}); + auto strings_view = cudf::strings_column_view(strings); + + std::string pattern = "(\\d+)-(\\d+)"; + std::string repl_template = "${2} X ${1}0"; + auto results = cudf::strings::replace_with_backrefs(strings_view, pattern, repl_template); + + cudf::test::strings_column_wrapper expected( + {"3 X 120 5 X 340 89 X 670", "99 X 00: 888 X 7770:: 0 X 56730"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsReplaceTests, ReplaceBackrefsRegexReversedTest) { cudf::test::strings_column_wrapper strings( @@ -203,6 +217,17 @@ TEST_F(StringsReplaceTests, BackrefWithGreedyQuantifier) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsReplaceTests, ReplaceBackrefsRegexErrorTest) +{ + cudf::test::strings_column_wrapper strings({"this string left intentionally blank"}); + auto view = cudf::strings_column_view(strings); + + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", "\\0"), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", "\\123"), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "", "\\1"), cudf::logic_error); + EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", ""), cudf::logic_error); +} + TEST_F(StringsReplaceTests, MediumReplaceRegex) { // This results in 95 regex instructions and falls in the 'medium' range. 
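As a point of reference for the new backref tests above, a small usage sketch of cudf::strings::replace_with_backrefs with the same numbered-group replacement template the test uses (illustrative only; `input` is an assumed pre-existing strings column, not part of this patch):

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/strings/replace_re.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

std::unique_ptr<cudf::column> swap_number_pairs(cudf::column_view const& input)
{
  auto const sv = cudf::strings_column_view(input);
  // "${1}", "${2}", ... in the replacement template refer to capture groups in the
  // pattern; e.g. "12-3" becomes "3 X 120" (group 2, " X ", group 1, then a literal "0").
  return cudf::strings::replace_with_backrefs(sv, "(\\d+)-(\\d+)", "${2} X ${1}0");
}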
diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index 548284d6c87..a94a35e8896 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -433,11 +433,6 @@ TYPED_TEST(TypedStructColumnWrapperTest, TestListsOfStructs) cudf::test::expect_columns_equivalent(expected_unchanged_struct_col, cudf::lists_column_view(*list_col).child()); - -#ifndef NDEBUG - std::cout << "Printing list col: \n"; - cudf::test::print(*list_col); -#endif } TYPED_TEST(TypedStructColumnWrapperTest, ListOfStructOfList) diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp new file mode 100644 index 00000000000..d4ded02adce --- /dev/null +++ b/cpp/tests/structs/utilities_tests.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::test { + +/** + * @brief Round-trip input table through flatten/unflatten, + * verify that the table remains equivalent. + */ +void flatten_unflatten_compare(table_view const& input_table) +{ + using namespace cudf::structs::detail; + + auto [flattened, _, __, ___] = + flatten_nested_columns(input_table, {}, {}, column_nullability::FORCE); + auto unflattened = + unflatten_nested_columns(std::make_unique(flattened), input_table); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, unflattened->view()); +} + +using namespace cudf; +using iterators::null_at; +using strings = strings_column_wrapper; +using structs = structs_column_wrapper; + +struct StructUtilitiesTest : BaseFixture { +}; + +template +struct TypedStructUtilitiesTest : StructUtilitiesTest { +}; + +TYPED_TEST_CASE(TypedStructUtilitiesTest, FixedWidthTypes); + +TYPED_TEST(TypedStructUtilitiesTest, ListsAtTopLevelUnsupported) +{ + using T = TypeParam; + using lists = lists_column_wrapper; + using nums = fixed_width_column_wrapper; + + auto lists_col = lists{{0, 1}, {22, 33}, {44, 55, 66}}; + auto nums_col = nums{{0, 1, 2}, null_at(6)}; + + EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{lists_col, nums_col}}), + cudf::logic_error); +} + +TYPED_TEST(TypedStructUtilitiesTest, NestedListsUnsupported) +{ + using T = TypeParam; + using lists = lists_column_wrapper; + using nums = fixed_width_column_wrapper; + + auto lists_member = lists{{0, 1}, {22, 33}, {44, 55, 66}}; + auto nums_member = nums{{0, 1, 2}, null_at(6)}; + auto structs_col = structs{{nums_member, lists_member}}; + + auto nums_col = nums{{0, 1, 2}, null_at(6)}; + + EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}}), + cudf::logic_error); +} + +TYPED_TEST(TypedStructUtilitiesTest, NoStructs) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_col = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)}; + auto strings_col = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + 
auto nuther_nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, strings_col, nuther_nums_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStruct) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_member = nums{{0, 1, 22, 333, 44, 55, 66}, null_at(0)}; + auto strings_member = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + auto structs_col = structs{{nums_member, strings_member}}; + auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStructWithNulls) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_member = nums{{0, 1, 22, 333, 44, 55, 66}, null_at(0)}; + auto strings_member = strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + auto structs_col = structs{{nums_member, strings_member}, null_at(2)}; + auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, structs_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, StructOfStruct) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + auto struct_0_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)}; + auto struct_0_strings_member = + strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + auto structs_1_structs_member = structs{{struct_0_nums_member, struct_0_strings_member}}; + + auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)}; + auto struct_of_structs_col = structs{{struct_1_nums_member, structs_1_structs_member}}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtLeafLevel) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + auto struct_0_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)}; + auto struct_0_strings_member = + strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + auto structs_1_structs_member = + structs{{struct_0_nums_member, struct_0_strings_member}, null_at(2)}; + + auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)}; + auto struct_of_structs_col = structs{{struct_1_nums_member, structs_1_structs_member}}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtTopLevel) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_col = nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + auto struct_0_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)}; + auto struct_0_strings_member = + strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + auto structs_1_structs_member = structs{{struct_0_nums_member, struct_0_strings_member}}; + + auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)}; + auto struct_of_structs_col = + structs{{struct_1_nums_member, structs_1_structs_member}, null_at(4)}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtAllLevels) +{ + using T = TypeParam; + using nums = fixed_width_column_wrapper; + + auto nums_col = 
nums{{0, 1, 2, 3, 4, 5, 6}, null_at(6)}; + + auto struct_0_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(0)}; + auto struct_0_strings_member = + strings{{"", "1", "22", "333", "4444", "55555", "666666"}, null_at(1)}; + auto structs_1_structs_member = + structs{{struct_0_nums_member, struct_0_strings_member}, null_at(2)}; + + auto struct_1_nums_member = nums{{0, 1, 22, 33, 44, 55, 66}, null_at(3)}; + auto struct_of_structs_col = + structs{{struct_1_nums_member, structs_1_structs_member}, null_at(4)}; + + flatten_unflatten_compare(cudf::table_view{{nums_col, struct_of_structs_col}}); +} + +TYPED_TEST(TypedStructUtilitiesTest, ListsAreUnsupported) +{ + using T = TypeParam; + using ints = fixed_width_column_wrapper; + using lcw = lists_column_wrapper; + + // clang-format off + auto lists_member = lcw{ {0,1,2}, {3,4,5}, {6,7,8,9} }; + auto ints_member = ints{ 0, 1, 2 }; + // clang-format on + + auto structs_with_lists_col = structs{lists_member, ints_member}; + + EXPECT_THROW(flatten_unflatten_compare(cudf::table_view{{structs_with_lists_col}}), + cudf::logic_error); +} + +} // namespace cudf::test diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 88e9e3d1384..f3002bc4b1a 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -114,14 +114,6 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, // // result = [6, 1, 11, 1, 1] // - auto validity_iter = cudf::detail::make_counting_transform_iterator( - 0, - [row_indices = row_indices.begin(), - validity = c.null_mask(), - offset = c.offset()] __device__(int index) { - auto const true_index = row_indices[index] + offset; - return !validity || cudf::bit_is_set(validity, true_index) ? 1 : 0; - }); auto output_row_iter = cudf::detail::make_counting_transform_iterator( 0, [row_indices = row_indices.begin(), @@ -136,8 +128,9 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, output_row_iter, output_row_iter + row_indices.size(), output_row_start->view().begin(), - validity_iter, - result->mutable_view().begin()); + row_size_iter, + result->mutable_view().begin(), + [] __device__(auto row_size) { return row_size != 0; }); // generate keys for each output row // @@ -150,11 +143,12 @@ std::unique_ptr generate_child_row_indices(lists_column_view const& c, keys->mutable_view().end(), [] __device__() { return 0; }); thrust::scatter_if(rmm::exec_policy(), - validity_iter, - validity_iter + row_indices.size(), + row_size_iter, + row_size_iter + row_indices.size(), output_row_start->view().begin(), - validity_iter, - keys->mutable_view().begin()); + row_size_iter, + keys->mutable_view().begin(), + [] __device__(auto row_size) { return row_size != 0; }); thrust::inclusive_scan(rmm::exec_policy(), keys->view().begin(), keys->view().end(), diff --git a/docs/cudf/source/_static/RAPIDS-logo-purple.png b/docs/cudf/source/_static/RAPIDS-logo-purple.png new file mode 100644 index 00000000000..d884e01374d Binary files /dev/null and b/docs/cudf/source/_static/RAPIDS-logo-purple.png differ diff --git a/docs/cudf/source/_static/copybutton_pydocs.js b/docs/cudf/source/_static/copybutton_pydocs.js deleted file mode 100644 index cec05777e6b..00000000000 --- a/docs/cudf/source/_static/copybutton_pydocs.js +++ /dev/null @@ -1,65 +0,0 @@ -$(document).ready(function() { - /* Add a [>>>] button on the top-right corner of code samples to hide - * the >>> and ... prompts and the output and thus make the code - * copyable. 
*/ - var div = $('.highlight-python .highlight,' + - '.highlight-python3 .highlight,' + - '.highlight-pycon .highlight,' + - '.highlight-default .highlight'); - var pre = div.find('pre'); - - // get the styles from the current theme - pre.parent().parent().css('position', 'relative'); - var hide_text = 'Hide the prompts and output'; - var show_text = 'Show the prompts and output'; - var border_width = pre.css('border-top-width'); - var border_style = pre.css('border-top-style'); - var border_color = pre.css('border-top-color'); - var button_styles = { - 'cursor':'pointer', 'position': 'absolute', 'top': '0', 'right': '0', - 'border-color': border_color, 'border-style': border_style, - 'border-width': border_width, 'text-size': '75%', - 'font-family': 'monospace', 'padding-left': '0.2em', 'padding-right': '1.5em', - 'border-radius': '0 3px 0 0', - 'transition': "0.5s" - } - - // create and add the button to all the code blocks that contain >>> - div.each(function(index) { - var jthis = $(this); - if (jthis.find('.gp').length > 0) { - var button = $('>>>'); - button.css(button_styles) - button.attr('title', hide_text); - button.data('hidden', 'false'); - jthis.prepend(button); - } - // tracebacks (.gt) contain bare text elements that need to be - // wrapped in a span to work with .nextUntil() (see later) - jthis.find('pre:has(.gt)').contents().filter(function() { - return ((this.nodeType == 3) && (this.data.trim().length > 0)); - }).wrap(''); - }); - - // define the behavior of the button when it's clicked - $('.copybutton').click(function(e){ - e.preventDefault(); - var button = $(this); - if (button.data('hidden') === 'false') { - // hide the code output - button.parent().find('.go, .gp, .gt').hide(); - button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden'); - button.css('text-decoration', 'line-through'); - button.attr('title', show_text); - button.data('hidden', 'true'); - } else { - // show the code output - button.parent().find('.go, .gp, .gt').show(); - button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible'); - button.css('text-decoration', 'none'); - button.attr('title', hide_text); - button.data('hidden', 'false'); - } - }); -}); - diff --git a/docs/cudf/source/_static/params.css b/docs/cudf/source/_static/params.css index 475b9dfb4ec..2bdd6f5a299 100644 --- a/docs/cudf/source/_static/params.css +++ b/docs/cudf/source/_static/params.css @@ -8,14 +8,6 @@ content: ":"; } -.highlight:hover span#strike_button { - color:#767676; -} - -span#strike_button { - color :#d0ced7; -} - /* Fix for text wrap in sphinx tables: * https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html */ @@ -40,3 +32,24 @@ table.io-supported-types-table { table.io-supported-types-table thead{ text-align: center !important; } + +:root { + + --pst-color-active-navigation: 114, 83, 237; + --pst-color-navbar-link: 77, 77, 77; + --pst-color-navbar-link-hover: var(--pst-color-active-navigation); + --pst-color-navbar-link-active: var(--pst-color-active-navigation); + --pst-color-sidebar-link: 77, 77, 77; + --pst-color-sidebar-link-hover: var(--pst-color-active-navigation); + --pst-color-sidebar-link-active: var(--pst-color-active-navigation); + --pst-color-sidebar-expander-background-hover: 244, 244, 244; + --pst-color-sidebar-caption: 77, 77, 77; + --pst-color-toc-link: 119, 117, 122; + --pst-color-toc-link-hover: var(--pst-color-active-navigation); + --pst-color-toc-link-active: var(--pst-color-active-navigation); + +} + +.special-table td, .special-table th { 
+ border: 1px solid #dee2e6; +} \ No newline at end of file diff --git a/docs/cudf/source/_templates/autosummary/class_with_autosummary.rst b/docs/cudf/source/_templates/autosummary/class_with_autosummary.rst new file mode 100644 index 00000000000..f86822bc567 --- /dev/null +++ b/docs/cudf/source/_templates/autosummary/class_with_autosummary.rst @@ -0,0 +1,33 @@ +{% extends "!autosummary/class.rst" %} + +{% block methods %} +{% if methods %} + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + {% for item in all_methods %} + {%- if not item.startswith('_') or item in ['__call__'] %} + {{ name }}.{{ item }} + {%- endif -%} + {%- endfor %} + +{% endif %} +{% endblock %} + +{% block attributes %} +{% if attributes %} + +.. + HACK -- the point here is that we don't want this to appear in the output, but the autosummary should still generate the pages. + .. autosummary:: + :toctree: + {% for item in all_attributes %} + {%- if not item.startswith('_') %} + {{ name }}.{{ item }} + {%- endif -%} + {%- endfor %} + +{% endif %} +{% endblock %} \ No newline at end of file diff --git a/docs/cudf/source/_templates/autosummary/class_without_autosummary.rst b/docs/cudf/source/_templates/autosummary/class_without_autosummary.rst new file mode 100644 index 00000000000..b57a7ceebb0 --- /dev/null +++ b/docs/cudf/source/_templates/autosummary/class_without_autosummary.rst @@ -0,0 +1,6 @@ +{{ fullname }} +{{ underline }} + +.. currentmodule:: {{ module }} + +.. autoclass:: {{ objname }} \ No newline at end of file diff --git a/docs/cudf/source/api.rst b/docs/cudf/source/api.rst deleted file mode 100644 index d3042be2129..00000000000 --- a/docs/cudf/source/api.rst +++ /dev/null @@ -1,270 +0,0 @@ -~~~~~~~~~~~~~~~~~~~ -cuDF API Reference -~~~~~~~~~~~~~~~~~~~ - -.. currentmodule:: cudf.core.dataframe - -DataFrame ---------- -.. autoclass:: DataFrame - :members: - :inherited-members: - :exclude-members: serialize, deserialize, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, itertuples, iterrows - -Series ------- -.. currentmodule:: cudf.core.series - -.. autoclass:: Series - :members: - :inherited-members: - :exclude-members: serialize, deserialize, logical_not, logical_or, logical_and, remainder, sum_of_squares, fill, merge, iteritems, items, device_deserialize, device_serialize, host_deserialize, host_serialize, to_dict, tolist, to_list - -Lists ------ -.. currentmodule:: cudf.core.column.lists - -.. autoclass:: ListMethods - :members: - -Strings -------- -.. currentmodule:: cudf.core.column.string - -.. autoclass:: StringMethods - :members: - -General Functions ------------------ -.. automodule:: cudf.core.reshape - :members: -.. autofunction:: cudf.to_datetime -.. autofunction:: cudf.to_numeric - -Index ------ -.. currentmodule:: cudf.core.index -.. autoclass:: Index - :members: - :inherited-members: - :exclude-members: serialize, deserialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -RangeIndex ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: RangeIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -GenericIndex ------------- -.. currentmodule:: cudf.core.index -.. 
autoclass:: GenericIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -MultiIndex ----------- -.. currentmodule:: cudf.core.multiindex -.. autoclass:: MultiIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Int8Index ---------- -.. currentmodule:: cudf.core.index -.. autoclass:: Int8Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Int16Index ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: Int16Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Int32Index ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: Int32Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Int64Index ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: Int64Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -UInt8Index ----------- -.. currentmodule:: cudf.core.index -.. autoclass:: UInt8Index - :inherited-members: - :members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -UInt16Index ------------ -.. currentmodule:: cudf.core.index -.. autoclass:: UInt16Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -UInt32Index ------------ -.. currentmodule:: cudf.core.index -.. autoclass:: UInt32Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -UInt64Index ------------ -.. currentmodule:: cudf.core.index -.. autoclass:: UInt64Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Float32Index ------------- -.. currentmodule:: cudf.core.index -.. autoclass:: Float32Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Float64Index ------------- -.. currentmodule:: cudf.core.index -.. autoclass:: Float64Index - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -CategoricalIndex ----------------- -.. currentmodule:: cudf.core.index -.. autoclass:: CategoricalIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -StringIndex ------------ -.. currentmodule:: cudf.core.index -.. 
autoclass:: StringIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -DatetimeIndex -------------- -.. currentmodule:: cudf.core.index -.. autoclass:: DatetimeIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -TimedeltaIndex --------------- -.. currentmodule:: cudf.core.index -.. autoclass:: TimedeltaIndex - :members: - :inherited-members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize, tolist, to_list - -Categories ----------- -.. currentmodule:: cudf.core.column.categorical - -.. autoclass:: CategoricalAccessor - :members: - -GroupBy -------- -.. currentmodule:: cudf.core.groupby.groupby - -.. autoclass:: GroupBy - :members: - :exclude-members: deserialize, serialize, device_deserialize, device_serialize, host_deserialize, host_serialize - -Window ------- -.. currentmodule:: cudf.core.window -.. autoclass:: Rolling - :members: - -SubwordTokenizer ----------------- -.. currentmodule:: cudf.core.subword_tokenizer - -.. autoclass:: SubwordTokenizer - :members: - :special-members: __call__ - -General utility functions -------------------------- -.. currentmodule:: cudf.testing - -.. automodule:: cudf.testing.testing - :members: - - -Timedelta Properties --------------------- -.. currentmodule:: cudf.core.series -.. autoclass:: TimedeltaProperties - :members: - -Datetime Properties -------------------- -.. currentmodule:: cudf.core.series -.. autoclass:: DatetimeProperties - :members: - -IO --- -.. currentmodule:: cudf.io - -.. automodule:: cudf.io.csv - :members: -.. automodule:: cudf.io.parquet - :members: -.. automodule:: cudf.io.orc - :members: -.. automodule:: cudf.io.json - :members: -.. automodule:: cudf.io.avro - :members: -.. automodule:: cudf.io.dlpack - :members: -.. automodule:: cudf.io.feather - :members: -.. automodule:: cudf.io.hdf - :members: - -Extending cuDF ----------------- -.. currentmodule:: cudf.api.extensions - -.. automodule:: cudf.api.extensions.accessor - :members: - -GpuArrowReader --------------- -.. currentmodule:: cudf.comm.gpuarrow -.. autoclass:: GpuArrowReader - :members: - :exclude-members: count, index diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst new file mode 100644 index 00000000000..12ff1f13bc4 --- /dev/null +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -0,0 +1,254 @@ +========= +DataFrame +========= +.. currentmodule:: cudf + +Constructor +~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + :template: autosummary/class_with_autosummary.rst + + DataFrame + +Attributes and underlying data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +**Axes** + +.. autosummary:: + :toctree: api/ + + DataFrame.index + DataFrame.columns + +.. autosummary:: + :toctree: api/ + + DataFrame.dtypes + DataFrame.info + DataFrame.select_dtypes + DataFrame.values + DataFrame.ndim + DataFrame.size + DataFrame.shape + DataFrame.memory_usage + DataFrame.empty + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.astype + DataFrame.copy + +Indexing, iteration +~~~~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + + DataFrame.head + DataFrame.at + DataFrame.iat + DataFrame.loc + DataFrame.iloc + DataFrame.insert + DataFrame.__iter__ + DataFrame.iteritems + DataFrame.keys + DataFrame.iterrows + DataFrame.itertuples + DataFrame.pop + DataFrame.tail + DataFrame.isin + DataFrame.where + DataFrame.mask + DataFrame.query + +For more information on ``.at``, ``.iat``, ``.loc``, and +``.iloc``, see the :ref:`indexing documentation `. + +Binary operator functions +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.add + DataFrame.sub + DataFrame.mul + DataFrame.div + DataFrame.truediv + DataFrame.floordiv + DataFrame.mod + DataFrame.pow + DataFrame.radd + DataFrame.rsub + DataFrame.rmul + DataFrame.rdiv + DataFrame.rtruediv + DataFrame.rfloordiv + DataFrame.rmod + DataFrame.rpow + +Function application, GroupBy & window +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.apply + DataFrame.apply_chunks + DataFrame.apply_rows + DataFrame.pipe + DataFrame.agg + DataFrame.groupby + DataFrame.rolling + +.. _api.dataframe.stats: + +Computations / descriptive stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.all + DataFrame.any + DataFrame.clip + DataFrame.corr + DataFrame.count + DataFrame.cov + DataFrame.cummax + DataFrame.cummin + DataFrame.cumprod + DataFrame.cumsum + DataFrame.describe + DataFrame.kurt + DataFrame.kurtosis + DataFrame.max + DataFrame.mean + DataFrame.min + DataFrame.mode + DataFrame.prod + DataFrame.product + DataFrame.quantile + DataFrame.quantiles + DataFrame.rank + DataFrame.round + DataFrame.skew + DataFrame.sum + DataFrame.std + DataFrame.var + +Reindexing / selection / label manipulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.drop + DataFrame.drop_duplicates + DataFrame.equals + DataFrame.head + DataFrame.reindex + DataFrame.rename + DataFrame.reset_index + DataFrame.sample + DataFrame.searchsorted + DataFrame.set_index + DataFrame.repeat + DataFrame.tail + DataFrame.take + DataFrame.tile + +.. _api.dataframe.missing: + +Missing data handling +~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.dropna + DataFrame.fillna + DataFrame.isna + DataFrame.isnull + DataFrame.nans_to_nulls + DataFrame.notna + DataFrame.notnull + DataFrame.replace + +Reshaping, sorting, transposing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.argsort + DataFrame.interleave_columns + DataFrame.partition_by_hash + DataFrame.pivot + DataFrame.scatter_by_map + DataFrame.sort_values + DataFrame.sort_index + DataFrame.nlargest + DataFrame.nsmallest + DataFrame.stack + DataFrame.unstack + DataFrame.melt + DataFrame.explode + DataFrame.T + DataFrame.transpose + +Combining / comparing / joining / merging / encoding +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.append + DataFrame.assign + DataFrame.join + DataFrame.merge + DataFrame.update + DataFrame.label_encoding + DataFrame.one_hot_encoding + +Numerical operations +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.acos + DataFrame.asin + DataFrame.atan + DataFrame.cos + DataFrame.exp + DataFrame.log + DataFrame.sin + DataFrame.sqrt + DataFrame.tan + +Time Series-related +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.shift + +Serialization / IO / conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: api/ + + DataFrame.as_gpu_matrix + DataFrame.as_matrix + DataFrame.from_arrow + DataFrame.from_pandas + DataFrame.from_records + DataFrame.hash_columns + DataFrame.to_arrow + DataFrame.to_dlpack + DataFrame.to_parquet + DataFrame.to_csv + DataFrame.to_hdf + DataFrame.to_dict + DataFrame.to_json + DataFrame.to_pandas + DataFrame.to_feather + DataFrame.to_records + DataFrame.to_string diff --git a/docs/cudf/source/api_docs/general_functions.rst b/docs/cudf/source/api_docs/general_functions.rst new file mode 100644 index 00000000000..226ae8acd32 --- /dev/null +++ b/docs/cudf/source/api_docs/general_functions.rst @@ -0,0 +1,32 @@ +================= +General Functions +================= +.. currentmodule:: cudf + +Data manipulations +------------------ + +.. autosummary:: + :toctree: api/ + + cudf.concat + cudf.melt + cudf.get_dummies + cudf.merge_sorted + cudf.pivot + cudf.unstack + +Top-level conversions +--------------------- +.. autosummary:: + :toctree: api/ + + cudf.to_numeric + +Top-level dealing with datetimelike +----------------------------------- + +.. autosummary:: + :toctree: api/ + + cudf.to_datetime diff --git a/docs/cudf/source/api_docs/general_utilities.rst b/docs/cudf/source/api_docs/general_utilities.rst new file mode 100644 index 00000000000..d9c53c3fbbd --- /dev/null +++ b/docs/cudf/source/api_docs/general_utilities.rst @@ -0,0 +1,13 @@ +================= +General Utilities +================= + +Testing functions +----------------- +.. autosummary:: + :toctree: api/ + + cudf.testing.testing.assert_column_equal + cudf.testing.testing.assert_frame_equal + cudf.testing.testing.assert_index_equal + cudf.testing.testing.assert_series_equal diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst new file mode 100644 index 00000000000..27a314fa425 --- /dev/null +++ b/docs/cudf/source/api_docs/groupby.rst @@ -0,0 +1,96 @@ +.. _api.groupby: + +======= +GroupBy +======= +.. currentmodule:: cudf.core.groupby + +GroupBy objects are returned by groupby calls: :func:`cudf.DataFrame.groupby`, :func:`cudf.Series.groupby`, etc. + +Indexing, iteration +------------------- +.. autosummary:: + :toctree: api/ + + GroupBy.__iter__ + GroupBy.groups + +.. currentmodule:: cudf + +.. autosummary:: + :toctree: api/ + + Grouper + +.. currentmodule:: cudf.core.groupby.groupby + +Function application +-------------------- +.. autosummary:: + :toctree: api/ + + GroupBy.apply + GroupBy.agg + SeriesGroupBy.aggregate + DataFrameGroupBy.aggregate + GroupBy.pipe + +Computations / descriptive stats +-------------------------------- +.. autosummary:: + :toctree: api/ + + GroupBy.bfill + GroupBy.backfill + GroupBy.count + GroupBy.cumcount + GroupBy.cummax + GroupBy.cummin + GroupBy.cumsum + GroupBy.ffill + GroupBy.max + GroupBy.mean + GroupBy.median + GroupBy.min + GroupBy.nth + GroupBy.pad + GroupBy.prod + GroupBy.size + GroupBy.std + GroupBy.sum + GroupBy.var + +The following methods are available in both ``SeriesGroupBy`` and +``DataFrameGroupBy`` objects, but may differ slightly, usually in that +the ``DataFrameGroupBy`` version usually permits the specification of an +axis argument, and often an argument indicating whether to restrict +application to columns of a specific data type. + +.. 
autosummary:: + :toctree: api/ + + DataFrameGroupBy.backfill + DataFrameGroupBy.bfill + DataFrameGroupBy.count + DataFrameGroupBy.cumcount + DataFrameGroupBy.cummax + DataFrameGroupBy.cummin + DataFrameGroupBy.cumsum + DataFrameGroupBy.describe + DataFrameGroupBy.ffill + DataFrameGroupBy.fillna + DataFrameGroupBy.idxmax + DataFrameGroupBy.idxmin + DataFrameGroupBy.nunique + DataFrameGroupBy.pad + DataFrameGroupBy.quantile + DataFrameGroupBy.shift + DataFrameGroupBy.size + +The following methods are available only for ``SeriesGroupBy`` objects. + +.. autosummary:: + :toctree: api/ + + SeriesGroupBy.nunique + SeriesGroupBy.unique diff --git a/docs/cudf/source/api_docs/index.rst b/docs/cudf/source/api_docs/index.rst new file mode 100644 index 00000000000..70b9563fc1d --- /dev/null +++ b/docs/cudf/source/api_docs/index.rst @@ -0,0 +1,19 @@ +============= +API reference +============= + +This page provides a list of all publicly accessible modules, methods and classes through +``cudf.*`` namespace. + +.. toctree:: + :maxdepth: 2 + :caption: API Documentation + + series + dataframe + index_objects + groupby + general_functions + general_utilities + window + diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst new file mode 100644 index 00000000000..c23c9a3f6c1 --- /dev/null +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -0,0 +1,296 @@ +============= +Index objects +============= + +Index +----- +.. currentmodule:: cudf + +**Many of these methods or variants thereof are available on the objects +that contain an index (Series/DataFrame) and those should most likely be +used before calling these methods directly.** + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_with_autosummary.rst + + Index + +Properties +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.empty + Index.gpu_values + Index.is_monotonic + Index.is_monotonic_increasing + Index.is_monotonic_decreasing + Index.is_unique + Index.name + Index.names + Index.ndim + Index.nlevels + Index.shape + Index.size + Index.values + + +Modifying and computations +~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.any + Index.copy + Index.drop_duplicates + Index.equals + Index.factorize + Index.min + Index.max + Index.rename + Index.repeat + Index.where + Index.take + Index.unique + +Compatibility with MultiIndex +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.set_names + +Missing values +~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.fillna + Index.dropna + Index.isna + Index.notna + +Memory usage +~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.memory_usage + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.astype + Index.to_list + Index.to_series + Index.to_frame + +Sorting +~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.argsort + Index.searchsorted + Index.sort_values + +Time-specific operations +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.shift + +Combining / joining / set operations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.append + Index.join + Index.difference + +Selecting +~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Index.get_level_values + Index.get_loc + Index.get_slice_bound + Index.isin + +.. _api.numericindex: + +Numeric Index +------------- +.. 
autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + RangeIndex + Int64Index + UInt64Index + Float64Index + + +.. _api.categoricalindex: + +CategoricalIndex +---------------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + CategoricalIndex + +Categorical components +~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + CategoricalIndex.codes + CategoricalIndex.categories + +Modifying and computations +~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + CategoricalIndex.equals + +.. _api.intervalindex: + +IntervalIndex +------------- +.. autosummary:: + :toctree: api/ + + IntervalIndex + +IntervalIndex components +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + IntervalIndex.from_breaks + IntervalIndex.values + IntervalIndex.get_loc + +.. _api.multiindex: + +MultiIndex +---------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + MultiIndex + + +MultiIndex constructors +~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + MultiIndex.from_tuples + MultiIndex.from_product + MultiIndex.from_frame + MultiIndex.from_arrow + +MultiIndex properties +~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + MultiIndex.names + MultiIndex.levels + MultiIndex.codes + MultiIndex.nlevels + +MultiIndex components +~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + MultiIndex.to_frame + MultiIndex.droplevel + +MultiIndex selecting +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + MultiIndex.get_loc + MultiIndex.get_level_values + +.. _api.datetimeindex: + +DatetimeIndex +------------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + DatetimeIndex + +Time/date components +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DatetimeIndex.year + DatetimeIndex.month + DatetimeIndex.day + DatetimeIndex.hour + DatetimeIndex.minute + DatetimeIndex.second + DatetimeIndex.dayofweek + DatetimeIndex.weekday + +Time-specific operations +~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DatetimeIndex.round + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + DatetimeIndex.to_series + DatetimeIndex.to_frame + +TimedeltaIndex +-------------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + TimedeltaIndex + +Components +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + TimedeltaIndex.days + TimedeltaIndex.seconds + TimedeltaIndex.microseconds + TimedeltaIndex.nanoseconds + TimedeltaIndex.components + TimedeltaIndex.inferred_freq + +Conversion +~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + TimedeltaIndex.to_series + TimedeltaIndex.round + TimedeltaIndex.to_frame diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst new file mode 100644 index 00000000000..ffa809268f3 --- /dev/null +++ b/docs/cudf/source/api_docs/series.rst @@ -0,0 +1,478 @@ +====== +Series +====== +.. currentmodule:: cudf + +Constructor +----------- +.. autosummary:: + :toctree: api/ + :template: autosummary/class_with_autosummary.rst + + Series + +Attributes +---------- +**Axes** + +.. 
autosummary:: + :toctree: api/ + + Series.index + Series.values + Series.data + Series.dtype + Series.shape + Series.ndim + Series.nullable + Series.nullmask + Series.null_count + Series.size + Series.memory_usage + Series.has_nulls + Series.empty + Series.name + Series.valid_count + Series.values_host + +Conversion +---------- +.. autosummary:: + :toctree: api/ + + Series.astype + Series.copy + Series.to_list + Series.__array__ + Series.as_index + Series.as_mask + Series.scale + + +Indexing, iteration +------------------- +.. autosummary:: + :toctree: api/ + + Series.loc + Series.iloc + Series.__iter__ + Series.items + Series.iteritems + Series.keys + +For more information on ``.at``, ``.iat``, ``.loc``, and +``.iloc``, see the :ref:`indexing documentation `. + +Binary operator functions +------------------------- +.. autosummary:: + :toctree: api/ + + Series.add + Series.sub + Series.subtract + Series.mul + Series.multiply + Series.truediv + Series.floordiv + Series.mod + Series.pow + Series.radd + Series.rsub + Series.rmul + Series.rtruediv + Series.rfloordiv + Series.rmod + Series.rpow + Series.round + Series.lt + Series.gt + Series.le + Series.ge + Series.ne + Series.eq + Series.product + +Function application, GroupBy & window +-------------------------------------- +.. autosummary:: + :toctree: api/ + + Series.applymap + Series.map + Series.groupby + Series.rolling + Series.pipe + +.. _api.series.stats: + +Computations / descriptive stats +-------------------------------- +.. autosummary:: + :toctree: api/ + + Series.abs + Series.all + Series.any + Series.ceil + Series.clip + Series.corr + Series.count + Series.cov + Series.cummax + Series.cummin + Series.cumprod + Series.cumsum + Series.describe + Series.diff + Series.digitize + Series.factorize + Series.floor + Series.kurt + Series.max + Series.mean + Series.median + Series.min + Series.mode + Series.nlargest + Series.nsmallest + Series.prod + Series.quantile + Series.rank + Series.skew + Series.std + Series.sum + Series.var + Series.kurtosis + Series.unique + Series.nunique + Series.is_unique + Series.is_monotonic + Series.is_monotonic_increasing + Series.is_monotonic_decreasing + Series.value_counts + +Reindexing / selection / label manipulation +------------------------------------------- +.. autosummary:: + :toctree: api/ + + Series.drop + Series.drop_duplicates + Series.equals + Series.head + Series.isin + Series.reindex + Series.rename + Series.reset_index + Series.reverse + Series.sample + Series.set_index + Series.set_mask + Series.take + Series.tail + Series.tile + Series.where + Series.mask + +Missing data handling +--------------------- +.. autosummary:: + :toctree: api/ + + Series.dropna + Series.fillna + Series.isna + Series.isnull + Series.nans_to_nulls + Series.notna + Series.notnull + Series.replace + +Reshaping, sorting +------------------ +.. autosummary:: + :toctree: api/ + + Series.argsort + Series.interleave_columns + Series.sort_values + Series.sort_index + Series.explode + Series.scatter_by_map + Series.searchsorted + Series.repeat + +Combining / comparing / joining / merging / encoding +---------------------------------------------------- +.. autosummary:: + :toctree: api/ + + Series.append + Series.update + Series.label_encoding + Series.one_hot_encoding + +Numerical operations +~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: api/ + + Series.acos + Series.asin + Series.atan + Series.cos + Series.exp + Series.log + Series.sin + Series.sqrt + Series.tan + +Time Series-related +------------------- +.. 
autosummary:: + :toctree: api/ + + Series.shift + +Accessors +--------- + +pandas provides dtype-specific methods under various accessors. +These are separate namespaces within :class:`Series` that only apply +to specific data types. + +=========================== ================================= +Data Type Accessor +=========================== ================================= +Datetime, Timedelta :ref:`dt ` +String :ref:`str ` +Categorical :ref:`cat ` +List :ref:`list ` +=========================== ================================= + +.. _api.series.dt: + +Datetimelike properties +~~~~~~~~~~~~~~~~~~~~~~~ + +``Series.dt`` can be used to access the values of the series as +datetimelike and return several properties. +These can be accessed like ``Series.dt.``. + +Datetime properties +^^^^^^^^^^^^^^^^^^^ +.. currentmodule:: cudf.core.series.DatetimeProperties + +.. autosummary:: + :toctree: api/ + + day + dayofweek + hour + minute + month + second + weekday + year + +Datetime methods +^^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: api/ + + strftime + + +Timedelta properties +^^^^^^^^^^^^^^^^^^^^ + +.. currentmodule:: cudf.core.series.TimedeltaProperties +.. autosummary:: + :toctree: api/ + + components + days + microseconds + nanoseconds + seconds + + +.. _api.series.str: + +String handling +~~~~~~~~~~~~~~~ + +``Series.str`` can be used to access the values of the series as +strings and apply several methods to it. These can be accessed like +``Series.str.``. + +.. currentmodule:: cudf.core.column.string.StringMethods +.. autosummary:: + :toctree: api/ + + byte_count + capitalize + cat + center + character_ngrams + character_tokenize + code_points + contains + count + detokenize + edit_distance + endswith + extract + filter_alphanum + filter_characters + filter_tokens + find + findall + get + get_json_object + htoi + index + insert + ip2int + is_consonant + is_vowel + isalnum + isalpha + isdecimal + isdigit + isempty + isfloat + ishex + isinteger + isipv4 + isspace + islower + isnumeric + isupper + istimestamp + join + len + ljust + lower + lstrip + match + ngrams + ngrams_tokenize + normalize_characters + pad + partition + porter_stemmer_measure + replace + replace_tokens + replace_with_backrefs + rfind + rindex + rjust + rpartition + rstrip + slice + slice_from + slice_replace + split + rsplit + startswith + strip + subword_tokenize + swapcase + title + token_count + tokenize + translate + upper + url_decode + url_encode + wrap + zfill + + + +.. + The following is needed to ensure the generated pages are created with the + correct template (otherwise they would be created in the Series/Index class page) + +.. + .. currentmodule:: cudf + .. autosummary:: + :toctree: api/ + :template: autosummary/accessor.rst + + Series.str + Series.cat + Series.dt + Index.str + +.. _api.series.cat: + +Categorical accessor +~~~~~~~~~~~~~~~~~~~~ + +Categorical-dtype specific methods and attributes are available under +the ``Series.cat`` accessor. + +.. currentmodule:: cudf.core.column.categorical.CategoricalAccessor +.. autosummary:: + :toctree: api/ + + categories + ordered + codes + reorder_categories + add_categories + remove_categories + set_categories + as_ordered + as_unordered + + +.. _api.series.list: + +List handling +~~~~~~~~~~~~~ + +``Series.list`` can be used to access the values of the series as +lists and apply list methods to it. These can be accessed like +``Series.list.``. + +.. currentmodule:: cudf.core.column.lists.ListMethods +.. 
autosummary:: + :toctree: api/ + + concat + contains + get + len + sort_values + take + unique + + +Serialization / IO / conversion +------------------------------- +.. currentmodule:: cudf +.. autosummary:: + :toctree: api/ + + Series.to_array + Series.to_arrow + Series.to_dlpack + Series.to_frame + Series.to_gpu_array + Series.to_hdf + Series.to_json + Series.to_pandas + Series.to_string + Series.from_arrow + Series.from_categorical + Series.from_masked_array + Series.from_pandas + Series.hash_encode + Series.hash_values + \ No newline at end of file diff --git a/docs/cudf/source/api_docs/window.rst b/docs/cudf/source/api_docs/window.rst new file mode 100644 index 00000000000..9f94f620949 --- /dev/null +++ b/docs/cudf/source/api_docs/window.rst @@ -0,0 +1,24 @@ +.. _api.window: + +====== +Window +====== + +Rolling objects are returned by ``.rolling`` calls: :func:`cudf.DataFrame.rolling`, :func:`cudf.Series.rolling`, etc. + +.. _api.functions_rolling: + +Rolling window functions +------------------------ +.. currentmodule:: cudf.core.window.rolling + +.. autosummary:: + :toctree: api/ + + Rolling.count + Rolling.sum + Rolling.mean + Rolling.min + Rolling.max + Rolling.apply + diff --git a/docs/cudf/source/basics.rst b/docs/cudf/source/basics.rst deleted file mode 100644 index 15b4b43662b..00000000000 --- a/docs/cudf/source/basics.rst +++ /dev/null @@ -1,54 +0,0 @@ -Basics -====== - - -Supported Dtypes ----------------- - -cuDF uses dtypes for Series or individual columns of a DataFrame. cuDF uses NumPy dtypes, NumPy provides support for ``float``, ``int``, ``bool``, -``'timedelta64[s]'``, ``'timedelta64[ms]'``, ``'timedelta64[us]'``, ``'timedelta64[ns]'``, ``'datetime64[s]'``, ``'datetime64[ms]'``, -``'datetime64[us]'``, ``'datetime64[ns]'`` (note that NumPy does not support timezone-aware datetimes). - - -The following table lists all of cudf types. For methods requiring dtype arguments, strings can be specified as indicated. See the respective documentation sections for more on each type. 
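
A minimal usage sketch of how these string aliases are typically passed as ``dtype`` arguments (illustrative only, not part of the patched files; it relies solely on ``cudf.Series``, ``Series.astype``, and ``cudf.to_datetime``, all of which appear in the API reference added earlier in this diff):

    import cudf

    # Integer and float aliases are accepted at construction time and by astype().
    ints = cudf.Series([1, 2, 3], dtype="int32")
    floats = ints.astype("float64")

    # Categorical and boolean aliases work the same way.
    cats = cudf.Series(["a", "b", "a"], dtype="category")
    flags = cudf.Series([True, False, True], dtype="bool")

    # Datetime columns can be produced with cudf.to_datetime (datetime64[ns]).
    stamps = cudf.to_datetime(cudf.Series(["2001-01-01", "2001-01-02"]))
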
- - -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Kind of Data | Data Type | Scalar | String Aliases | -+========================+==================+=====================================================================================+=============================================+ -| Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, np.uint16_, | ``'int8'``, ``'int16'``, ``'int32'``, | -| | | np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | -| | | | ``'uint32'``, ``'uint64'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Strings | | `str `_ | ``'string'``, ``'object'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | -| | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``,| -| (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'``| -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Categorical | CategoricalDtype | (none) | ``'category'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Boolean | | np.bool_ | ``'bool'`` | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ -| Decimal | Decimal64Dtype | (none) | (none) | -+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - -**Note: All dtypes above are Nullable** - -.. _np.int8: -.. _np.int16: -.. _np.int32: -.. _np.int64: -.. _np.uint8: -.. _np.uint16: -.. _np.uint32: -.. _np.uint64: -.. _np.float32: -.. _np.float64: -.. _np.bool: https://numpy.org/doc/stable/user/basics.types.html -.. _np.datetime64: https://numpy.org/doc/stable/reference/arrays.datetime.html#basic-datetimes -.. 
_np.timedelta64: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-and-timedelta-arithmetic diff --git a/docs/cudf/source/PandasCompat.rst b/docs/cudf/source/basics/PandasCompat.rst similarity index 100% rename from docs/cudf/source/PandasCompat.rst rename to docs/cudf/source/basics/PandasCompat.rst diff --git a/docs/cudf/source/basics/basics.rst b/docs/cudf/source/basics/basics.rst new file mode 100644 index 00000000000..ee63f67daa2 --- /dev/null +++ b/docs/cudf/source/basics/basics.rst @@ -0,0 +1,56 @@ +Basics +====== + + +Supported Dtypes +---------------- + +cuDF uses dtypes for Series or individual columns of a DataFrame. cuDF uses NumPy dtypes, NumPy provides support for ``float``, ``int``, ``bool``, +``'timedelta64[s]'``, ``'timedelta64[ms]'``, ``'timedelta64[us]'``, ``'timedelta64[ns]'``, ``'datetime64[s]'``, ``'datetime64[ms]'``, +``'datetime64[us]'``, ``'datetime64[ns]'`` (note that NumPy does not support timezone-aware datetimes). + + +The following table lists all of cudf types. For methods requiring dtype arguments, strings can be specified as indicated. See the respective documentation sections for more on each type. + +.. rst-class:: special-table +.. table:: + + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Kind of Data | Data Type | Scalar | String Aliases | + +========================+==================+=====================================================================================+=============================================+ + | Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, np.uint16_, | ``'int8'``, ``'int16'``, ``'int32'``, | + | | | np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | + | | | | ``'uint32'``, ``'uint64'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Strings | | `str `_ | ``'string'``, ``'object'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | + | | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``,| + | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'``| + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Categorical | CategoricalDtype | (none) | ``'category'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Boolean | | np.bool_ | ``'bool'`` | + 
+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Decimal | Decimal64Dtype | (none) | (none) | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + +**Note: All dtypes above are Nullable** + +.. _np.int8: +.. _np.int16: +.. _np.int32: +.. _np.int64: +.. _np.uint8: +.. _np.uint16: +.. _np.uint32: +.. _np.uint64: +.. _np.float32: +.. _np.float64: +.. _np.bool: https://numpy.org/doc/stable/user/basics.types.html +.. _np.datetime64: https://numpy.org/doc/stable/reference/arrays.datetime.html#basic-datetimes +.. _np.timedelta64: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-and-timedelta-arithmetic diff --git a/docs/cudf/source/dask-cudf.rst b/docs/cudf/source/basics/dask-cudf.rst similarity index 100% rename from docs/cudf/source/dask-cudf.rst rename to docs/cudf/source/basics/dask-cudf.rst diff --git a/docs/cudf/source/groupby.rst b/docs/cudf/source/basics/groupby.rst similarity index 51% rename from docs/cudf/source/groupby.rst rename to docs/cudf/source/basics/groupby.rst index a6ce9db6817..04c4d42fa2a 100644 --- a/docs/cudf/source/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -131,41 +131,44 @@ Aggregations on groups is supported via the ``agg`` method: The following table summarizes the available aggregations and the types that support them: -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| Aggregations / dtypes | Numeric | Datetime | String | Categorical | List | Struct | Interval | Decimal | -+====================================+===========+============+==========+===============+========+==========+============+===========+ -| count | ✅ | ✅ | ✅ | ✅ | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| size | ✅ | ✅ | ✅ | ✅ | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| sum | ✅ | ✅ | | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| idxmin | ✅ | ✅ | | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| idxmax | ✅ | ✅ | | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| min | ✅ | ✅ | ✅ | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| max | ✅ | ✅ | ✅ | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| mean | ✅ | ✅ | | | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| var | ✅ | ✅ | | | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| std | ✅ | ✅ | | | | | | | 
-+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| quantile | ✅ | ✅ | | | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| median | ✅ | ✅ | | | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| nunique | ✅ | ✅ | ✅ | ✅ | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| nth | ✅ | ✅ | ✅ | | | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| collect | ✅ | ✅ | ✅ | | ✅ | | | ✅ | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ -| unique | ✅ | ✅ | ✅ | ✅ | | | | | -+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ +.. rst-class:: special-table +.. table:: + + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | Aggregations / dtypes | Numeric | Datetime | String | Categorical | List | Struct | Interval | Decimal | + +====================================+===========+============+==========+===============+========+==========+============+===========+ + | count | ✅ | ✅ | ✅ | ✅ | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | size | ✅ | ✅ | ✅ | ✅ | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | sum | ✅ | ✅ | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | idxmin | ✅ | ✅ | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | idxmax | ✅ | ✅ | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | min | ✅ | ✅ | ✅ | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | max | ✅ | ✅ | ✅ | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | mean | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | var | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | std | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | quantile | ✅ | ✅ | | | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | median | ✅ | ✅ | | | | | | | + 
+------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | nunique | ✅ | ✅ | ✅ | ✅ | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | nth | ✅ | ✅ | ✅ | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | collect | ✅ | ✅ | ✅ | | ✅ | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | unique | ✅ | ✅ | ✅ | ✅ | | | | | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ GroupBy apply ------------- diff --git a/docs/cudf/source/basics/index.rst b/docs/cudf/source/basics/index.rst new file mode 100644 index 00000000000..a29866d7e32 --- /dev/null +++ b/docs/cudf/source/basics/index.rst @@ -0,0 +1,15 @@ +====== +Basics +====== + + +.. toctree:: + :maxdepth: 2 + + basics + io.rst + groupby.rst + PandasCompat.rst + dask-cudf.rst + internals.rst + \ No newline at end of file diff --git a/docs/cudf/source/internals.rst b/docs/cudf/source/basics/internals.rst similarity index 100% rename from docs/cudf/source/internals.rst rename to docs/cudf/source/basics/internals.rst diff --git a/docs/cudf/source/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst similarity index 100% rename from docs/cudf/source/io-gds-integration.rst rename to docs/cudf/source/basics/io-gds-integration.rst diff --git a/docs/cudf/source/io-supported-types.rst b/docs/cudf/source/basics/io-supported-types.rst similarity index 99% rename from docs/cudf/source/io-supported-types.rst rename to docs/cudf/source/basics/io-supported-types.rst index 739c1634ca7..78c1bfb6554 100644 --- a/docs/cudf/source/io-supported-types.rst +++ b/docs/cudf/source/basics/io-supported-types.rst @@ -3,7 +3,7 @@ I/O Supported dtypes The following table lists are compatible cudf types for each supported IO format. -.. rst-class:: io-supported-types-table +.. rst-class:: io-supported-types-table special-table .. table:: :widths: 15 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 diff --git a/docs/cudf/source/io.rst b/docs/cudf/source/basics/io.rst similarity index 100% rename from docs/cudf/source/io.rst rename to docs/cudf/source/basics/io.rst diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c764b64da60..c5f1233d022 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -24,7 +24,10 @@ from docutils.nodes import Text from sphinx.addnodes import pending_xref +import cudf +sys.path.insert(0, os.path.abspath(cudf.__path__[0])) +sys.path.insert(0, os.path.abspath(".")) sys.path.insert(0, os.path.abspath("../..")) sys.path.append(os.path.abspath("./_ext")) @@ -43,7 +46,6 @@ "sphinx.ext.autosummary", "sphinx_copybutton", "numpydoc", - "sphinx_markdown_tables", "IPython.sphinxext.ipython_console_highlighting", "IPython.sphinxext.ipython_directive", "nbsphinx", @@ -51,9 +53,11 @@ ] copybutton_prompt_text = ">>> " - +autosummary_generate = True ipython_mplbackend = "str" +html_use_modindex = True + # Add any paths that contain templates here, relative to this directory. 
templates_path = ["_templates"] @@ -61,7 +65,7 @@ # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = {".rst": "restructuredtext", ".md": "markdown"} +source_suffix = {".rst": "restructuredtext"} # The master toctree document. master_doc = "index" @@ -90,21 +94,30 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = [] +exclude_patterns = ['venv', "**/includes/**",] # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + +html_theme_options = { + "external_links": [], + "github_url": "https://github.com/rapidsai/cudf", + "twitter_url": "https://twitter.com/rapidsai", + "show_toc_level": 1, + "navbar_align": "right", +} include_pandas_compat = True -# -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = "sphinx_rtd_theme" - +html_theme = "pydata_sphinx_theme" +html_logo = "_static/RAPIDS-logo-purple.png" # on_rtd is whether we are on readthedocs.org on_rtd = os.environ.get("READTHEDOCS", None) == "True" @@ -112,10 +125,10 @@ # only import and set the theme if we're building docs locally # otherwise, readthedocs.org uses their theme by default, # so no need to specify it - import sphinx_rtd_theme + import pydata_sphinx_theme - html_theme = "sphinx_rtd_theme" - html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + html_theme = "pydata_sphinx_theme" + html_theme_path = pydata_sphinx_theme.get_html_theme_path() # Theme options are theme-specific and customize the look and feel of a theme @@ -201,8 +214,9 @@ # Config numpydoc numpydoc_show_inherited_class_members = True numpydoc_class_members_toctree = False +numpydoc_attributes_as_param_list = False -autoclass_content = "init" +autoclass_content = "class" # Replace API shorthands with fullname _reftarget_aliases = { @@ -234,10 +248,27 @@ def ignore_internal_references(app, env, node, contnode): node["reftarget"] = "" return contnode +def process_class_docstrings(app, what, name, obj, options, lines): + """ + For those classes for which we use :: + :template: autosummary/class_without_autosummary.rst + the documented attributes/methods have to be listed in the class + docstring. However, if one of those lists is empty, we use 'None', + which then generates warnings in sphinx / ugly html output. + This "autodoc-process-docstring" event connector removes that part + from the processed docstring. + """ + if what == "class": + if name in {"cudf.RangeIndex", "cudf.Int64Index", "cudf.UInt64Index", "cudf.Float64Index", "cudf.CategoricalIndex", "cudf.IntervalIndex", "cudf.MultiIndex", "cudf.DatetimeIndex", "cudf.TimedeltaIndex", "cudf.TimedeltaIndex"}: + + cut_index = lines.index('.. 
rubric:: Attributes') + lines[:] = lines[:cut_index] + + + def setup(app): - app.add_js_file("copybutton_pydocs.js") app.add_css_file("params.css") - app.add_css_file("https://docs.rapids.ai/assets/css/custom.css") app.connect("doctree-read", resolve_aliases) app.connect("missing-reference", ignore_internal_references) + app.connect("autodoc-process-docstring", process_class_docstrings) diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst index 5a6d9a2617d..90b287bd1b6 100644 --- a/docs/cudf/source/index.rst +++ b/docs/cudf/source/index.rst @@ -1,25 +1,25 @@ Welcome to cuDF's documentation! ================================= +cuDF is a Python GPU DataFrame library (built on the `Apache Arrow +`_ columnar memory format) for loading, joining, +aggregating, filtering, and otherwise manipulating data. cuDF also provides a +pandas-like API that will be familiar to data engineers & data scientists, so +they can use it to easily accelerate their workflows without going into +the details of CUDA programming. + + .. toctree:: :maxdepth: 2 :caption: Contents: - api.rst - 10min.ipynb - basics.rst - io.rst - groupby.rst - dask-cudf.rst - 10min-cudf-cupy.ipynb - guide-to-udfs.ipynb - internals.rst - Working-with-missing-data.ipynb - PandasCompat.rst + user_guide/index + basics/index + api_docs/index + Indices and tables ================== * :ref:`genindex` -* :ref:`modindex` * :ref:`search` diff --git a/docs/cudf/source/10min-cudf-cupy.ipynb b/docs/cudf/source/user_guide/10min-cudf-cupy.ipynb similarity index 100% rename from docs/cudf/source/10min-cudf-cupy.ipynb rename to docs/cudf/source/user_guide/10min-cudf-cupy.ipynb diff --git a/docs/cudf/source/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb similarity index 100% rename from docs/cudf/source/10min.ipynb rename to docs/cudf/source/user_guide/10min.ipynb diff --git a/docs/cudf/source/Working-with-missing-data.ipynb b/docs/cudf/source/user_guide/Working-with-missing-data.ipynb similarity index 100% rename from docs/cudf/source/Working-with-missing-data.ipynb rename to docs/cudf/source/user_guide/Working-with-missing-data.ipynb diff --git a/docs/cudf/source/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb similarity index 100% rename from docs/cudf/source/guide-to-udfs.ipynb rename to docs/cudf/source/user_guide/guide-to-udfs.ipynb diff --git a/docs/cudf/source/user_guide/index.rst b/docs/cudf/source/user_guide/index.rst new file mode 100644 index 00000000000..1061008eb3c --- /dev/null +++ b/docs/cudf/source/user_guide/index.rst @@ -0,0 +1,12 @@ +========== +User Guide +========== + + +.. toctree:: + :maxdepth: 2 + + 10min.ipynb + 10min-cudf-cupy.ipynb + guide-to-udfs.ipynb + Working-with-missing-data.ipynb diff --git a/java/pom.xml b/java/pom.xml index b9bf5e9d8b7..1b4a31116d4 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -138,6 +138,18 @@ ${arrow.version} test + + org.apache.parquet + parquet-avro + 1.10.0 + test + + + org.apache.hadoop + hadoop-common + 3.1.0 + test + diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java index 49c6d2b6ffc..734d9cb5694 100644 --- a/java/src/main/java/ai/rapids/cudf/Aggregation.java +++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java @@ -24,7 +24,7 @@ * Represents an aggregation operation. Please note that not all aggregations work, or even make * sense in all types of aggregation operations. 
*/ -public abstract class Aggregation { +abstract class Aggregation { static { NativeDepsLoader.loadNativeDeps(); } @@ -65,7 +65,7 @@ enum Kind { M2(26), MERGE_M2(27), RANK(28), - DENSE_RANK(29);; + DENSE_RANK(29); final int nativeId; @@ -102,7 +102,7 @@ public boolean equals(Object other) { } } - public static final class NthAggregation extends Aggregation { + static final class NthAggregation extends Aggregation { private final int offset; private final NullPolicy nullPolicy; @@ -194,7 +194,7 @@ public boolean equals(Object other) { } } - private static class QuantileAggregation extends Aggregation { + private static final class QuantileAggregation extends Aggregation { private final QuantileMethod method; private final double[] quantiles; @@ -275,8 +275,7 @@ long getDefaultOutput() { } } - public static final class CollectListAggregation extends Aggregation - implements RollingAggregation { + static final class CollectListAggregation extends Aggregation { private final NullPolicy nullPolicy; private CollectListAggregation(NullPolicy nullPolicy) { @@ -306,8 +305,7 @@ public boolean equals(Object other) { } } - public static final class CollectSetAggregation extends Aggregation - implements RollingAggregation { + static final class CollectSetAggregation extends Aggregation { private final NullPolicy nullPolicy; private final NullEquality nullEquality; private final NaNEquality nanEquality; @@ -348,7 +346,7 @@ public boolean equals(Object other) { } } - public static final class MergeSetsAggregation extends Aggregation { + static final class MergeSetsAggregation extends Aggregation { private final NullEquality nullEquality; private final NaNEquality nanEquality; @@ -388,14 +386,6 @@ protected Aggregation(Kind kind) { this.kind = kind; } - /** - * Add a column to the Aggregation so it can be used on a specific column of data. - * @param columnIndex the index of the column to operate on. - */ - public AggregationOnColumn onColumn(int columnIndex) { - return new AggregationOnColumn((T)this, columnIndex); - } - /** * Get the native view of a ColumnVector that provides default values to be used for some window * aggregations when there is not enough data to do the computation. This really only happens @@ -433,8 +423,7 @@ static void close(long[] ptrs) { static native void close(long ptr); - public static class SumAggregation extends NoParamAggregation - implements RollingAggregation { + static final class SumAggregation extends NoParamAggregation { private SumAggregation() { super(Kind.SUM); } @@ -443,11 +432,11 @@ private SumAggregation() { /** * Sum reduction. */ - public static SumAggregation sum() { + static SumAggregation sum() { return new SumAggregation(); } - public static class ProductAggregation extends NoParamAggregation { + static final class ProductAggregation extends NoParamAggregation { private ProductAggregation() { super(Kind.PRODUCT); } @@ -456,12 +445,11 @@ private ProductAggregation() { /** * Product reduction. */ - public static ProductAggregation product() { + static ProductAggregation product() { return new ProductAggregation(); } - public static class MinAggregation extends NoParamAggregation - implements RollingAggregation { + static final class MinAggregation extends NoParamAggregation { private MinAggregation() { super(Kind.MIN); } @@ -470,12 +458,11 @@ private MinAggregation() { /** * Min reduction. 
*/ - public static MinAggregation min() { + static MinAggregation min() { return new MinAggregation(); } - public static class MaxAggregation extends NoParamAggregation - implements RollingAggregation { + static final class MaxAggregation extends NoParamAggregation { private MaxAggregation() { super(Kind.MAX); } @@ -484,12 +471,11 @@ private MaxAggregation() { /** * Max reduction. */ - public static MaxAggregation max() { + static MaxAggregation max() { return new MaxAggregation(); } - public static class CountAggregation extends CountLikeAggregation - implements RollingAggregation { + static final class CountAggregation extends CountLikeAggregation { private CountAggregation(NullPolicy nullPolicy) { super(Kind.COUNT, nullPolicy); } @@ -498,7 +484,7 @@ private CountAggregation(NullPolicy nullPolicy) { /** * Count number of valid, a.k.a. non-null, elements. */ - public static CountAggregation count() { + static CountAggregation count() { return count(NullPolicy.EXCLUDE); } @@ -507,11 +493,11 @@ public static CountAggregation count() { * @param nullPolicy INCLUDE if nulls should be counted. EXCLUDE if only non-null values * should be counted. */ - public static CountAggregation count(NullPolicy nullPolicy) { + static CountAggregation count(NullPolicy nullPolicy) { return new CountAggregation(nullPolicy); } - public static class AnyAggregation extends NoParamAggregation { + static final class AnyAggregation extends NoParamAggregation { private AnyAggregation() { super(Kind.ANY); } @@ -522,11 +508,11 @@ private AnyAggregation() { * if any of the elements in the range are true or non-zero, otherwise produces a false or 0. * Null values are skipped. */ - public static AnyAggregation any() { + static AnyAggregation any() { return new AnyAggregation(); } - public static class AllAggregation extends NoParamAggregation { + static final class AllAggregation extends NoParamAggregation { private AllAggregation() { super(Kind.ALL); } @@ -537,12 +523,11 @@ private AllAggregation() { * the range are true or non-zero, otherwise produces a false or 0. * Null values are skipped. */ - public static AllAggregation all() { + static AllAggregation all() { return new AllAggregation(); } - - public static class SumOfSquaresAggregation extends NoParamAggregation { + static final class SumOfSquaresAggregation extends NoParamAggregation { private SumOfSquaresAggregation() { super(Kind.SUM_OF_SQUARES); } @@ -551,12 +536,11 @@ private SumOfSquaresAggregation() { /** * Sum of squares reduction. */ - public static SumOfSquaresAggregation sumOfSquares() { + static SumOfSquaresAggregation sumOfSquares() { return new SumOfSquaresAggregation(); } - public static class MeanAggregation extends NoParamAggregation - implements RollingAggregation{ + static final class MeanAggregation extends NoParamAggregation { private MeanAggregation() { super(Kind.MEAN); } @@ -565,11 +549,11 @@ private MeanAggregation() { /** * Arithmetic mean reduction. */ - public static MeanAggregation mean() { + static MeanAggregation mean() { return new MeanAggregation(); } - public static class M2Aggregation extends NoParamAggregation { + static final class M2Aggregation extends NoParamAggregation { private M2Aggregation() { super(Kind.M2); } @@ -578,11 +562,11 @@ private M2Aggregation() { /** * Sum of square of differences from mean. 
*/ - public static M2Aggregation M2() { + static M2Aggregation M2() { return new M2Aggregation(); } - public static class VarianceAggregation extends DdofAggregation { + static final class VarianceAggregation extends DdofAggregation { private VarianceAggregation(int ddof) { super(Kind.VARIANCE, ddof); } @@ -591,7 +575,7 @@ private VarianceAggregation(int ddof) { /** * Variance aggregation with 1 as the delta degrees of freedom. */ - public static VarianceAggregation variance() { + static VarianceAggregation variance() { return variance(1); } @@ -600,12 +584,12 @@ public static VarianceAggregation variance() { * @param ddof delta degrees of freedom. The divisor used in calculation of variance is * N - ddof, where N is the population size. */ - public static VarianceAggregation variance(int ddof) { + static VarianceAggregation variance(int ddof) { return new VarianceAggregation(ddof); } - public static class StandardDeviationAggregation extends DdofAggregation { + static final class StandardDeviationAggregation extends DdofAggregation { private StandardDeviationAggregation(int ddof) { super(Kind.STD, ddof); } @@ -614,7 +598,7 @@ private StandardDeviationAggregation(int ddof) { /** * Standard deviation aggregation with 1 as the delta degrees of freedom. */ - public static StandardDeviationAggregation standardDeviation() { + static StandardDeviationAggregation standardDeviation() { return standardDeviation(1); } @@ -623,11 +607,11 @@ public static StandardDeviationAggregation standardDeviation() { * @param ddof delta degrees of freedom. The divisor used in calculation of std is * N - ddof, where N is the population size. */ - public static StandardDeviationAggregation standardDeviation(int ddof) { + static StandardDeviationAggregation standardDeviation(int ddof) { return new StandardDeviationAggregation(ddof); } - public static class MedianAggregation extends NoParamAggregation { + static final class MedianAggregation extends NoParamAggregation { private MedianAggregation() { super(Kind.MEDIAN); } @@ -636,26 +620,25 @@ private MedianAggregation() { /** * Median reduction. */ - public static MedianAggregation median() { + static MedianAggregation median() { return new MedianAggregation(); } /** * Aggregate to compute the specified quantiles. Uses linear interpolation by default. */ - public static QuantileAggregation quantile(double ... quantiles) { + static QuantileAggregation quantile(double ... quantiles) { return quantile(QuantileMethod.LINEAR, quantiles); } /** * Aggregate to compute various quantiles. */ - public static QuantileAggregation quantile(QuantileMethod method, double ... quantiles) { + static QuantileAggregation quantile(QuantileMethod method, double ... quantiles) { return new QuantileAggregation(method, quantiles); } - public static class ArgMaxAggregation extends NoParamAggregation - implements RollingAggregation{ + static final class ArgMaxAggregation extends NoParamAggregation { private ArgMaxAggregation() { super(Kind.ARGMAX); } @@ -667,12 +650,11 @@ private ArgMaxAggregation() { * prior to doing the aggregation. This would result in an index into the sorted data being * returned. 
*/ - public static ArgMaxAggregation argMax() { + static ArgMaxAggregation argMax() { return new ArgMaxAggregation(); } - public static class ArgMinAggregation extends NoParamAggregation - implements RollingAggregation{ + static final class ArgMinAggregation extends NoParamAggregation { private ArgMinAggregation() { super(Kind.ARGMIN); } @@ -684,11 +666,11 @@ private ArgMinAggregation() { * prior to doing the aggregation. This would result in an index into the sorted data being * returned. */ - public static ArgMinAggregation argMin() { + static ArgMinAggregation argMin() { return new ArgMinAggregation(); } - public static class NuniqueAggregation extends CountLikeAggregation { + static final class NuniqueAggregation extends CountLikeAggregation { private NuniqueAggregation(NullPolicy nullPolicy) { super(Kind.NUNIQUE, nullPolicy); } @@ -697,7 +679,7 @@ private NuniqueAggregation(NullPolicy nullPolicy) { /** * Number of unique, non-null, elements. */ - public static NuniqueAggregation nunique() { + static NuniqueAggregation nunique() { return nunique(NullPolicy.EXCLUDE); } @@ -707,7 +689,7 @@ public static NuniqueAggregation nunique() { * compare as equal so multiple null values in a range would all only * increase the count by 1. */ - public static NuniqueAggregation nunique(NullPolicy nullPolicy) { + static NuniqueAggregation nunique(NullPolicy nullPolicy) { return new NuniqueAggregation(nullPolicy); } @@ -716,7 +698,7 @@ public static NuniqueAggregation nunique(NullPolicy nullPolicy) { * @param offset the offset to look at. Negative numbers go from the end of the group. Any * value outside of the group range results in a null. */ - public static NthAggregation nth(int offset) { + static NthAggregation nth(int offset) { return nth(offset, NullPolicy.INCLUDE); } @@ -727,12 +709,11 @@ public static NthAggregation nth(int offset) { * @param nullPolicy INCLUDE if nulls should be included in the aggregation or EXCLUDE if they * should be skipped. */ - public static NthAggregation nth(int offset, NullPolicy nullPolicy) { + static NthAggregation nth(int offset, NullPolicy nullPolicy) { return new NthAggregation(offset, nullPolicy); } - public static class RowNumberAggregation extends NoParamAggregation - implements RollingAggregation{ + static final class RowNumberAggregation extends NoParamAggregation { private RowNumberAggregation() { super(Kind.ROW_NUMBER); } @@ -741,12 +722,11 @@ private RowNumberAggregation() { /** * Get the row number, only makes sense for a window operations. */ - public static RowNumberAggregation rowNumber() { + static RowNumberAggregation rowNumber() { return new RowNumberAggregation(); } - public static class RankAggregation extends NoParamAggregation - implements RollingAggregation{ + static final class RankAggregation extends NoParamAggregation { private RankAggregation() { super(Kind.RANK); } @@ -755,12 +735,11 @@ private RankAggregation() { /** * Get the row's ranking. */ - public static RankAggregation rank() { + static RankAggregation rank() { return new RankAggregation(); } - public static class DenseRankAggregation extends NoParamAggregation - implements RollingAggregation{ + static final class DenseRankAggregation extends NoParamAggregation { private DenseRankAggregation() { super(Kind.DENSE_RANK); } @@ -769,14 +748,14 @@ private DenseRankAggregation() { /** * Get the row's dense ranking. */ - public static DenseRankAggregation denseRank() { + static DenseRankAggregation denseRank() { return new DenseRankAggregation(); } /** * Collect the values into a list. 
Nulls will be skipped. */ - public static CollectListAggregation collectList() { + static CollectListAggregation collectList() { return collectList(NullPolicy.EXCLUDE); } @@ -785,7 +764,7 @@ public static CollectListAggregation collectList() { * * @param nullPolicy Indicates whether to include/exclude nulls during collection. */ - public static CollectListAggregation collectList(NullPolicy nullPolicy) { + static CollectListAggregation collectList(NullPolicy nullPolicy) { return new CollectListAggregation(nullPolicy); } @@ -793,7 +772,7 @@ public static CollectListAggregation collectList(NullPolicy nullPolicy) { * Collect the values into a set. All null values will be excluded, and all nan values are regarded as * unique instances. */ - public static CollectSetAggregation collectSet() { + static CollectSetAggregation collectSet() { return collectSet(NullPolicy.EXCLUDE, NullEquality.UNEQUAL, NaNEquality.UNEQUAL); } @@ -804,11 +783,11 @@ public static CollectSetAggregation collectSet() { * @param nullEquality Flag to specify whether null entries within each list should be considered equal. * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal. */ - public static CollectSetAggregation collectSet(NullPolicy nullPolicy, NullEquality nullEquality, NaNEquality nanEquality) { + static CollectSetAggregation collectSet(NullPolicy nullPolicy, NullEquality nullEquality, NaNEquality nanEquality) { return new CollectSetAggregation(nullPolicy, nullEquality, nanEquality); } - public static final class MergeListsAggregation extends NoParamAggregation { + static final class MergeListsAggregation extends NoParamAggregation { private MergeListsAggregation() { super(Kind.MERGE_LISTS); } @@ -818,7 +797,7 @@ private MergeListsAggregation() { * Merge the partial lists produced by multiple CollectListAggregations. * NOTICE: The partial lists to be merged should NOT include any null list element (but can include null list entries). */ - public static MergeListsAggregation mergeLists() { + static MergeListsAggregation mergeLists() { return new MergeListsAggregation(); } @@ -826,7 +805,7 @@ public static MergeListsAggregation mergeLists() { * Merge the partial sets produced by multiple CollectSetAggregations. Each null/nan value will be regarded as * a unique instance. */ - public static MergeSetsAggregation mergeSets() { + static MergeSetsAggregation mergeSets() { return mergeSets(NullEquality.UNEQUAL, NaNEquality.UNEQUAL); } @@ -836,58 +815,39 @@ public static MergeSetsAggregation mergeSets() { * @param nullEquality Flag to specify whether null entries within each list should be considered equal. * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal. */ - public static MergeSetsAggregation mergeSets(NullEquality nullEquality, NaNEquality nanEquality) { + static MergeSetsAggregation mergeSets(NullEquality nullEquality, NaNEquality nanEquality) { return new MergeSetsAggregation(nullEquality, nanEquality); } - public static class LeadAggregation extends LeadLagAggregation - implements RollingAggregation { + static final class LeadAggregation extends LeadLagAggregation { private LeadAggregation(int offset, ColumnVector defaultOutput) { super(Kind.LEAD, offset, defaultOutput); } } - /** - * In a rolling window return the value offset entries ahead or null if it is outside of the - * window. 
- */ - public static LeadAggregation lead(int offset) { - return lead(offset, null); - } - /** * In a rolling window return the value offset entries ahead or the corresponding value from * defaultOutput if it is outside of the window. Note that this does not take any ownership of * defaultOutput and the caller mush ensure that defaultOutput remains valid during the life * time of this aggregation operation. */ - public static LeadAggregation lead(int offset, ColumnVector defaultOutput) { + static LeadAggregation lead(int offset, ColumnVector defaultOutput) { return new LeadAggregation(offset, defaultOutput); } - public static class LagAggregation extends LeadLagAggregation - implements RollingAggregation{ + static final class LagAggregation extends LeadLagAggregation { private LagAggregation(int offset, ColumnVector defaultOutput) { super(Kind.LAG, offset, defaultOutput); } } - - /** - * In a rolling window return the value offset entries behind or null if it is outside of the - * window. - */ - public static LagAggregation lag(int offset) { - return lag(offset, null); - } - /** * In a rolling window return the value offset entries behind or the corresponding value from * defaultOutput if it is outside of the window. Note that this does not take any ownership of * defaultOutput and the caller mush ensure that defaultOutput remains valid during the life * time of this aggregation operation. */ - public static LagAggregation lag(int offset, ColumnVector defaultOutput) { + static LagAggregation lag(int offset, ColumnVector defaultOutput) { return new LagAggregation(offset, defaultOutput); } @@ -900,7 +860,7 @@ private MergeM2Aggregation() { /** * Merge the partial M2 values produced by multiple instances of M2Aggregation. */ - public static MergeM2Aggregation mergeM2() { + static MergeM2Aggregation mergeM2() { return new MergeM2Aggregation(); } diff --git a/java/src/main/java/ai/rapids/cudf/AggregationOverWindow.java b/java/src/main/java/ai/rapids/cudf/AggregationOverWindow.java index abce287c9b0..d5544e01e7e 100644 --- a/java/src/main/java/ai/rapids/cudf/AggregationOverWindow.java +++ b/java/src/main/java/ai/rapids/cudf/AggregationOverWindow.java @@ -22,12 +22,12 @@ * An Aggregation instance that also holds a column number and window metadata so the aggregation * can be done over a specific window. 
*/ -public class AggregationOverWindow> - extends AggregationOnColumn { +public final class AggregationOverWindow { + private final RollingAggregationOnColumn wrapped; protected final WindowOptions windowOptions; - AggregationOverWindow(T wrapped, int columnIndex, WindowOptions windowOptions) { - super(wrapped, columnIndex); + AggregationOverWindow(RollingAggregationOnColumn wrapped, WindowOptions windowOptions) { + this.wrapped = wrapped; this.windowOptions = windowOptions; if (windowOptions == null) { @@ -43,23 +43,6 @@ public WindowOptions getWindowOptions() { return windowOptions; } - @Override - public AggregationOnColumn onColumn(int columnIndex) { - if (columnIndex == getColumnIndex()) { - return this; // NOOP - } else { - return new AggregationOverWindow(this.wrapped, columnIndex, windowOptions); - } - } - - @Override - public AggregationOverWindow overWindow(WindowOptions windowOptions) { - if (this.windowOptions.equals(windowOptions)) { - return this; - } - return new AggregationOverWindow(wrapped, columnIndex, windowOptions); - } - @Override public int hashCode() { return 31 * super.hashCode() + windowOptions.hashCode(); @@ -69,10 +52,22 @@ public int hashCode() { public boolean equals(Object other) { if (other == this) { return true; - } else if (other instanceof AggregationOnColumn) { - AggregationOnColumn o = (AggregationOnColumn) other; - return wrapped.equals(o.wrapped) && columnIndex == o.columnIndex; + } else if (other instanceof AggregationOverWindow) { + AggregationOverWindow o = (AggregationOverWindow) other; + return wrapped.equals(o.wrapped) && windowOptions.equals(o.windowOptions); } return false; } + + int getColumnIndex() { + return wrapped.getColumnIndex(); + } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index e543d0c7b21..6902e2b322b 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -152,6 +152,16 @@ private ColumnVector(long viewAddress, DeviceMemoryBuffer contiguousBuffer) { incRefCountInternal(true); } + + /** + * For a ColumnVector this is really just incrementing the reference count. + * @return this + */ + @Override + public ColumnVector copyToColumnVector() { + return incRefCount(); + } + /** * Retrieves the column_view for a cudf::column and if it fails to do so, the column is deleted * and the exception is thrown to the caller. @@ -803,7 +813,7 @@ private static native long stringConcatenation(long[] columnViews, long separato /** * Native method to concatenate columns of strings together using a separator specified for each row * and returns the result as a string column. - * @param columns array of longs holding the native handles of the column_views to combine. + * @param columnViews array of longs holding the native handles of the column_views to combine. * @param sep_column long holding the native handle of the strings_column_view used as separators. * @param separator_narep string scalar indicating null behavior when a separator is null. 
* If set to null and the separator is null the resulting string will diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 4a1ed3a178e..4d9991d0dd9 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -101,11 +101,39 @@ public ColumnView(DType type, long rows, Optional nullCount, || !nullCount.isPresent(); } + /** + * Create a new column view based off of data already on the device. Ref count on the buffers + * is not incremented and none of the underlying buffers are owned by this view. The returned + * ColumnView is only valid as long as the underlying buffers remain valid. If the buffers are + * closed before this ColumnView is closed, it will result in undefined behavior. + * + * If ownership is needed, call {@link ColumnView#copyToColumnVector} + * + * @param type the type of the vector + * @param rows the number of rows in this vector. + * @param nullCount the number of nulls in the dataset. + * @param dataBuffer a host buffer required for nested types including strings and string + * categories. The ownership doesn't change on this buffer + * @param validityBuffer an optional validity buffer. Must be provided if nullCount != 0. + * The ownership doesn't change on this buffer + * @param offsetBuffer The offsetbuffer for columns that need an offset buffer + */ + public ColumnView(DType type, long rows, Optional nullCount, + BaseDeviceMemoryBuffer dataBuffer, + BaseDeviceMemoryBuffer validityBuffer, BaseDeviceMemoryBuffer offsetBuffer) { + this(type, (int) rows, nullCount.orElse(UNKNOWN_NULL_COUNT).intValue(), + dataBuffer, validityBuffer, offsetBuffer, null); + assert (!type.isNestedType()); + assert (nullCount.isPresent() && nullCount.get() <= Integer.MAX_VALUE) + || !nullCount.isPresent(); + } + private ColumnView(DType type, long rows, int nullCount, BaseDeviceMemoryBuffer dataBuffer, BaseDeviceMemoryBuffer validityBuffer, BaseDeviceMemoryBuffer offsetBuffer, ColumnView[] children) { this(ColumnVector.initViewHandle(type, (int) rows, nullCount, dataBuffer, validityBuffer, - offsetBuffer, Arrays.stream(children).mapToLong(c -> c.getNativeView()).toArray())); + offsetBuffer, children == null ? new long[]{} : + Arrays.stream(children).mapToLong(c -> c.getNativeView()).toArray())); } /** Creates a ColumnVector from a column view handle @@ -140,6 +168,32 @@ public final DType getType() { return type; } + /** + * Returns the child column views for this view + * Please note that it is the responsibility of the caller to close these views. + * @return an array of child column views + */ + public final ColumnView[] getChildColumnViews() { + int numChildren = getNumChildren(); + if (!getType().isNestedType()) { + return null; + } + ColumnView[] views = new ColumnView[numChildren]; + try { + for (int i = 0; i < numChildren; i++) { + views[i] = getChildColumnView(i); + } + return views; + } catch(Throwable t) { + for (ColumnView v: views) { + if (v != null) { + v.close(); + } + } + throw t; + } + } + /** * Returns the child column view at a given index. * Please note that it is the responsibility of the caller to close this view. @@ -1135,7 +1189,7 @@ public Scalar sum() { * of the specified type. */ public Scalar sum(DType outType) { - return reduce(Aggregation.sum(), outType); + return reduce(ReductionAggregation.sum(), outType); } /** @@ -1143,7 +1197,7 @@ public Scalar sum(DType outType) { * of the same type as this column. 
*/ public Scalar min() { - return reduce(Aggregation.min(), type); + return reduce(ReductionAggregation.min(), type); } /** @@ -1160,7 +1214,7 @@ public Scalar min(DType outType) { return tmp.min(outType); } } - return reduce(Aggregation.min(), outType); + return reduce(ReductionAggregation.min(), outType); } /** @@ -1168,7 +1222,7 @@ public Scalar min(DType outType) { * of the same type as this column. */ public Scalar max() { - return reduce(Aggregation.max(), type); + return reduce(ReductionAggregation.max(), type); } /** @@ -1185,7 +1239,7 @@ public Scalar max(DType outType) { return tmp.max(outType); } } - return reduce(Aggregation.max(), outType); + return reduce(ReductionAggregation.max(), outType); } /** @@ -1201,7 +1255,7 @@ public Scalar product() { * of the specified type. */ public Scalar product(DType outType) { - return reduce(Aggregation.product(), outType); + return reduce(ReductionAggregation.product(), outType); } /** @@ -1217,7 +1271,7 @@ public Scalar sumOfSquares() { * scalar of the specified type. */ public Scalar sumOfSquares(DType outType) { - return reduce(Aggregation.sumOfSquares(), outType); + return reduce(ReductionAggregation.sumOfSquares(), outType); } /** @@ -1241,7 +1295,7 @@ public Scalar mean() { * types are currently supported. */ public Scalar mean(DType outType) { - return reduce(Aggregation.mean(), outType); + return reduce(ReductionAggregation.mean(), outType); } /** @@ -1265,7 +1319,7 @@ public Scalar variance() { * types are currently supported. */ public Scalar variance(DType outType) { - return reduce(Aggregation.variance(), outType); + return reduce(ReductionAggregation.variance(), outType); } /** @@ -1290,7 +1344,7 @@ public Scalar standardDeviation() { * types are currently supported. */ public Scalar standardDeviation(DType outType) { - return reduce(Aggregation.standardDeviation(), outType); + return reduce(ReductionAggregation.standardDeviation(), outType); } /** @@ -1309,7 +1363,7 @@ public Scalar any() { * Null values are skipped. */ public Scalar any(DType outType) { - return reduce(Aggregation.any(), outType); + return reduce(ReductionAggregation.any(), outType); } /** @@ -1330,7 +1384,7 @@ public Scalar all() { */ @Deprecated public Scalar all(DType outType) { - return reduce(Aggregation.all(), outType); + return reduce(ReductionAggregation.all(), outType); } /** @@ -1343,7 +1397,7 @@ public Scalar all(DType outType) { * empty or the reduction operation fails then the * {@link Scalar#isValid()} method of the result will return false. */ - public Scalar reduce(Aggregation aggregation) { + public Scalar reduce(ReductionAggregation aggregation) { return reduce(aggregation, type); } @@ -1360,7 +1414,7 @@ public Scalar reduce(Aggregation aggregation) { * empty or the reduction operation fails then the * {@link Scalar#isValid()} method of the result will return false. */ - public Scalar reduce(Aggregation aggregation, DType outType) { + public Scalar reduce(ReductionAggregation aggregation, DType outType) { long nativeId = aggregation.createNativeInstance(); try { return new Scalar(outType, reduce(getNativeView(), nativeId, outType.typeId.getNativeId(), outType.getScale())); @@ -1390,20 +1444,19 @@ public final ColumnVector quantile(QuantileMethod method, double[] quantiles) { * @throws IllegalArgumentException if unsupported window specification * (i.e. other than {@link WindowOptions.FrameType#ROWS} is used. 
*/ public final ColumnVector rollingWindow(RollingAggregation op, WindowOptions options) { - Aggregation agg = op.getBaseAggregation(); // Check that only row-based windows are used. if (!options.getFrameType().equals(WindowOptions.FrameType.ROWS)) { throw new IllegalArgumentException("Expected ROWS-based window specification. Unexpected window type: " + options.getFrameType()); } - long nativePtr = agg.createNativeInstance(); + long nativePtr = op.createNativeInstance(); try { Scalar p = options.getPrecedingScalar(); Scalar f = options.getFollowingScalar(); return new ColumnVector( rollingWindow(this.getNativeView(), - agg.getDefaultOutput(), + op.getDefaultOutput(), options.getMinPeriods(), nativePtr, p == null || !p.isValid() ? 0 : p.getInt(), @@ -1420,7 +1473,7 @@ public final ColumnVector rollingWindow(RollingAggregation op, WindowOptions opt * This is just a convenience method for an inclusive scan with a SUM aggregation. */ public final ColumnVector prefixSum() { - return scan(Aggregation.sum()); + return scan(ScanAggregation.sum()); } /** @@ -1431,7 +1484,7 @@ public final ColumnVector prefixSum() { * null policy too. Currently none of those aggregations are supported so * it is undefined how they would interact with each other. */ - public final ColumnVector scan(Aggregation aggregation, ScanType scanType, NullPolicy nullPolicy) { + public final ColumnVector scan(ScanAggregation aggregation, ScanType scanType, NullPolicy nullPolicy) { long nativeId = aggregation.createNativeInstance(); try { return new ColumnVector(scan(getNativeView(), nativeId, @@ -1446,7 +1499,7 @@ public final ColumnVector scan(Aggregation aggregation, ScanType scanType, NullP * @param aggregation the aggregation to perform * @param scanType should the scan be inclusive, include the current row, or exclusive. */ - public final ColumnVector scan(Aggregation aggregation, ScanType scanType) { + public final ColumnVector scan(ScanAggregation aggregation, ScanType scanType) { return scan(aggregation, scanType, NullPolicy.EXCLUDE); } @@ -1454,7 +1507,7 @@ public final ColumnVector scan(Aggregation aggregation, ScanType scanType) { * Computes an inclusive scan for a column that excludes nulls. * @param aggregation the aggregation to perform */ - public final ColumnVector scan(Aggregation aggregation) { + public final ColumnVector scan(ScanAggregation aggregation) { return scan(aggregation, ScanType.INCLUSIVE, NullPolicy.EXCLUDE); } diff --git a/java/src/main/java/ai/rapids/cudf/DType.java b/java/src/main/java/ai/rapids/cudf/DType.java index 87237f1e4b2..2d851aa2ae3 100644 --- a/java/src/main/java/ai/rapids/cudf/DType.java +++ b/java/src/main/java/ai/rapids/cudf/DType.java @@ -30,65 +30,61 @@ public final class DType { 2. Update SINGLETON_DTYPE_LOOKUP to reflect new type. The order should be maintained between DTypeEnum and SINGLETON_DTYPE_LOOKUP */ public enum DTypeEnum { - EMPTY(0, 0, "NOT SUPPORTED"), - INT8(1, 1, "byte"), - INT16(2, 2, "short"), - INT32(4, 3, "int"), - INT64(8, 4, "long"), - UINT8(1, 5, "uint8"), - UINT16(2, 6, "uint16"), - UINT32(4, 7, "uint32"), - UINT64(8, 8, "uint64"), - FLOAT32(4, 9, "float"), - FLOAT64(8, 10, "double"), + EMPTY(0, 0), + INT8(1, 1), + INT16(2, 2), + INT32(4, 3), + INT64(8, 4), + UINT8(1, 5), + UINT16(2, 6), + UINT32(4, 7), + UINT64(8, 8), + FLOAT32(4, 9), + FLOAT64(8, 10), /** * Byte wise true non-0/false 0. In general true will be 1. 
*/ - BOOL8(1, 11, "bool"), + BOOL8(1, 11), /** * Days since the UNIX epoch */ - TIMESTAMP_DAYS(4, 12, "date32"), + TIMESTAMP_DAYS(4, 12), /** * s since the UNIX epoch */ - TIMESTAMP_SECONDS(8, 13, "timestamp[s]"), + TIMESTAMP_SECONDS(8, 13), /** * ms since the UNIX epoch */ - TIMESTAMP_MILLISECONDS(8, 14, "timestamp[ms]"), + TIMESTAMP_MILLISECONDS(8, 14), /** * microseconds since the UNIX epoch */ - TIMESTAMP_MICROSECONDS(8, 15, "timestamp[us]"), + TIMESTAMP_MICROSECONDS(8, 15), /** * ns since the UNIX epoch */ - TIMESTAMP_NANOSECONDS(8, 16, "timestamp[ns]"), - - //We currently don't have mappings for duration type to I/O files, and these - //simpleNames might change in future when we do - DURATION_DAYS(4, 17, "int32"), - DURATION_SECONDS(8, 18, "int64"), - DURATION_MILLISECONDS(8, 19, "int64"), - DURATION_MICROSECONDS(8, 20, "int64"), - DURATION_NANOSECONDS(8, 21, "int64"), - //DICTIONARY32(4, 22, "NO IDEA"), - - STRING(0, 23, "str"), - LIST(0, 24, "list"), - DECIMAL32(4, 25, "decimal32"), - DECIMAL64(8, 26, "decimal64"), - STRUCT(0, 27, "struct"); + TIMESTAMP_NANOSECONDS(8, 16), + + DURATION_DAYS(4, 17), + DURATION_SECONDS(8, 18), + DURATION_MILLISECONDS(8, 19), + DURATION_MICROSECONDS(8, 20), + DURATION_NANOSECONDS(8, 21), + //DICTIONARY32(4, 22), + + STRING(0, 23), + LIST(0, 24), + DECIMAL32(4, 25), + DECIMAL64(8, 26), + STRUCT(0, 27); final int sizeInBytes; final int nativeId; - final String simpleName; - DTypeEnum(int sizeInBytes, int nativeId, String simpleName) { + DTypeEnum(int sizeInBytes, int nativeId) { this.sizeInBytes = sizeInBytes; this.nativeId = nativeId; - this.simpleName = simpleName; } public int getNativeId() { return nativeId; } @@ -191,12 +187,6 @@ private DType(DTypeEnum id, int decimalScale) { */ public int getScale() { return scale; } - /** - * Returns string name mapped to type. - * @return name corresponding to type - */ - public String getSimpleName() { return typeId.simpleName; } - /** * Return enum for this DType * @return DTypeEnum diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java new file mode 100644 index 00000000000..dd2adf8bee8 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java @@ -0,0 +1,296 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * An aggregation that can be used for a reduce. + */ +public final class GroupByAggregation { + private final Aggregation wrapped; + + private GroupByAggregation(Aggregation wrapped) { + this.wrapped = wrapped; + } + + Aggregation getWrapped() { + return wrapped; + } + + + /** + * Add a column to the Aggregation so it can be used on a specific column of data. + * @param columnIndex the index of the column to operate on. 
+ */ + public GroupByAggregationOnColumn onColumn(int columnIndex) { + return new GroupByAggregationOnColumn(this, columnIndex); + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof GroupByAggregation) { + GroupByAggregation o = (GroupByAggregation) other; + return wrapped.equals(o.wrapped); + } + return false; + } + + /** + * Count number of valid, a.k.a. non-null, elements. + */ + public static GroupByAggregation count() { + return new GroupByAggregation(Aggregation.count()); + } + + /** + * Count number of elements. + * @param nullPolicy INCLUDE if nulls should be counted. EXCLUDE if only non-null values + * should be counted. + */ + public static GroupByAggregation count(NullPolicy nullPolicy) { + return new GroupByAggregation(Aggregation.count(nullPolicy)); + } + + /** + * Sum Aggregation + */ + public static GroupByAggregation sum() { + return new GroupByAggregation(Aggregation.sum()); + } + + /** + * Product Aggregation. + */ + public static GroupByAggregation product() { + return new GroupByAggregation(Aggregation.product()); + } + + + /** + * Index of max element. Please note that when using this aggregation if the + * data is not already sorted by the grouping keys it may be automatically sorted + * prior to doing the aggregation. This would result in an index into the sorted data being + * returned. + */ + public static GroupByAggregation argMax() { + return new GroupByAggregation(Aggregation.argMax()); + } + + /** + * Index of min element. Please note that when using this aggregation if the + * data is not already sorted by the grouping keys it may be automatically sorted + * prior to doing the aggregation. This would result in an index into the sorted data being + * returned. + */ + public static GroupByAggregation argMin() { + return new GroupByAggregation(Aggregation.argMin()); + } + + /** + * Min Aggregation + */ + public static GroupByAggregation min() { + return new GroupByAggregation(Aggregation.min()); + } + + /** + * Max Aggregation + */ + public static GroupByAggregation max() { + return new GroupByAggregation(Aggregation.max()); + } + + /** + * Arithmetic mean reduction. + */ + public static GroupByAggregation mean() { + return new GroupByAggregation(Aggregation.mean()); + } + + /** + * Sum of square of differences from mean. + */ + public static GroupByAggregation M2() { + return new GroupByAggregation(Aggregation.M2()); + } + + /** + * Variance aggregation with 1 as the delta degrees of freedom. + */ + public static GroupByAggregation variance() { + return new GroupByAggregation(Aggregation.variance()); + } + + /** + * Variance aggregation. + * @param ddof delta degrees of freedom. The divisor used in calculation of variance is + * N - ddof, where N is the population size. + */ + public static GroupByAggregation variance(int ddof) { + return new GroupByAggregation(Aggregation.variance(ddof)); + } + + /** + * Standard deviation aggregation with 1 as the delta degrees of freedom. + */ + public static GroupByAggregation standardDeviation() { + return new GroupByAggregation(Aggregation.standardDeviation()); + } + + /** + * Standard deviation aggregation. + * @param ddof delta degrees of freedom. The divisor used in calculation of std is + * N - ddof, where N is the population size. 
+ */ + public static GroupByAggregation standardDeviation(int ddof) { + return new GroupByAggregation(Aggregation.standardDeviation(ddof)); + } + + /** + * Aggregate to compute the specified quantiles. Uses linear interpolation by default. + */ + public static GroupByAggregation quantile(double ... quantiles) { + return new GroupByAggregation(Aggregation.quantile(quantiles)); + } + + /** + * Aggregate to compute various quantiles. + */ + public static GroupByAggregation quantile(QuantileMethod method, double ... quantiles) { + return new GroupByAggregation(Aggregation.quantile(method, quantiles)); + } + + /** + * Median reduction. + */ + public static GroupByAggregation median() { + return new GroupByAggregation(Aggregation.median()); + } + + /** + * Number of unique, non-null, elements. + */ + public static GroupByAggregation nunique() { + return new GroupByAggregation(Aggregation.nunique()); + } + + /** + * Number of unique elements. + * @param nullPolicy INCLUDE if nulls should be counted else EXCLUDE. If nulls are counted they + * compare as equal so multiple null values in a range would all only + * increase the count by 1. + */ + public static GroupByAggregation nunique(NullPolicy nullPolicy) { + return new GroupByAggregation(Aggregation.nunique(nullPolicy)); + } + + /** + * Get the nth, non-null, element in a group. + * @param offset the offset to look at. Negative numbers go from the end of the group. Any + * value outside of the group range results in a null. + */ + public static GroupByAggregation nth(int offset) { + return new GroupByAggregation(Aggregation.nth(offset)); + } + + /** + * Get the nth element in a group. + * @param offset the offset to look at. Negative numbers go from the end of the group. Any + * value outside of the group range results in a null. + * @param nullPolicy INCLUDE if nulls should be included in the aggregation or EXCLUDE if they + * should be skipped. + */ + public static GroupByAggregation nth(int offset, NullPolicy nullPolicy) { + return new GroupByAggregation(Aggregation.nth(offset, nullPolicy)); + } + + /** + * Collect the values into a list. Nulls will be skipped. + */ + public static GroupByAggregation collectList() { + return new GroupByAggregation(Aggregation.collectList()); + } + + /** + * Collect the values into a list. + * + * @param nullPolicy Indicates whether to include/exclude nulls during collection. + */ + public static GroupByAggregation collectList(NullPolicy nullPolicy) { + return new GroupByAggregation(Aggregation.collectList(nullPolicy)); + } + + /** + * Collect the values into a set. All null values will be excluded, and all nan values are regarded as + * unique instances. + */ + public static GroupByAggregation collectSet() { + return new GroupByAggregation(Aggregation.collectSet()); + } + + /** + * Collect the values into a set. + * + * @param nullPolicy Indicates whether to include/exclude nulls during collection. + * @param nullEquality Flag to specify whether null entries within each list should be considered equal. + * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal. + */ + public static GroupByAggregation collectSet(NullPolicy nullPolicy, NullEquality nullEquality, NaNEquality nanEquality) { + return new GroupByAggregation(Aggregation.collectSet(nullPolicy, nullEquality, nanEquality)); + } + + /** + * Merge the partial lists produced by multiple CollectListAggregations. 
+ * NOTICE: The partial lists to be merged should NOT include any null list element (but can include null list entries). + */ + public static GroupByAggregation mergeLists() { + return new GroupByAggregation(Aggregation.mergeLists()); + } + + /** + * Merge the partial sets produced by multiple CollectSetAggregations. Each null/nan value will be regarded as + * a unique instance. + */ + public static GroupByAggregation mergeSets() { + return new GroupByAggregation(Aggregation.mergeSets()); + } + + /** + * Merge the partial sets produced by multiple CollectSetAggregations. + * + * @param nullEquality Flag to specify whether null entries within each list should be considered equal. + * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal. + */ + public static GroupByAggregation mergeSets(NullEquality nullEquality, NaNEquality nanEquality) { + return new GroupByAggregation(Aggregation.mergeSets(nullEquality, nanEquality)); + } + + /** + * Merge the partial M2 values produced by multiple instances of M2Aggregation. + */ + public static GroupByAggregation mergeM2() { + return new GroupByAggregation(Aggregation.mergeM2()); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregationOnColumn.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregationOnColumn.java new file mode 100644 index 00000000000..c50cf3728f0 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregationOnColumn.java @@ -0,0 +1,56 @@ +/* + * + * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * A GroupByAggregation for a specific column in a table. + */ +public final class GroupByAggregationOnColumn { + protected final GroupByAggregation wrapped; + protected final int columnIndex; + + GroupByAggregationOnColumn(GroupByAggregation wrapped, int columnIndex) { + this.wrapped = wrapped; + this.columnIndex = columnIndex; + } + + public int getColumnIndex() { + return columnIndex; + } + + GroupByAggregation getWrapped() { + return wrapped; + } + + @Override + public int hashCode() { + return 31 * wrapped.hashCode() + columnIndex; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof GroupByAggregationOnColumn) { + GroupByAggregationOnColumn o = (GroupByAggregationOnColumn) other; + return wrapped.equals(o.wrapped) && columnIndex == o.columnIndex; + } + return false; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java new file mode 100644 index 00000000000..219b6dde05d --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java @@ -0,0 +1,118 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * An aggregation that can be used for a grouped scan. + */ +public final class GroupByScanAggregation { + private final Aggregation wrapped; + + private GroupByScanAggregation(Aggregation wrapped) { + this.wrapped = wrapped; + } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } + + Aggregation getWrapped() { + return wrapped; + } + + /** + * Add a column to the Aggregation so it can be used on a specific column of data. + * @param columnIndex the index of the column to operate on. + */ + public GroupByScanAggregationOnColumn onColumn(int columnIndex) { + return new GroupByScanAggregationOnColumn(this, columnIndex); + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof GroupByScanAggregation) { + GroupByScanAggregation o = (GroupByScanAggregation) other; + return wrapped.equals(o.wrapped); + } + return false; + } + + /** + * Sum Aggregation + */ + public static GroupByScanAggregation sum() { + return new GroupByScanAggregation(Aggregation.sum()); + } + + + /** + * Product Aggregation. + */ + public static GroupByScanAggregation product() { + return new GroupByScanAggregation(Aggregation.product()); + } + + /** + * Min Aggregation + */ + public static GroupByScanAggregation min() { + return new GroupByScanAggregation(Aggregation.min()); + } + + /** + * Max Aggregation + */ + public static GroupByScanAggregation max() { + return new GroupByScanAggregation(Aggregation.max()); + } + + /** + * Count number of elements. + * @param nullPolicy INCLUDE if nulls should be counted. EXCLUDE if only non-null values + * should be counted. + */ + public static GroupByScanAggregation count(NullPolicy nullPolicy) { + return new GroupByScanAggregation(Aggregation.count(nullPolicy)); + } + + /** + * Get the row's ranking. + */ + public static GroupByScanAggregation rank() { + return new GroupByScanAggregation(Aggregation.rank()); + } + + /** + * Get the row's dense ranking. + */ + public static GroupByScanAggregation denseRank() { + return new GroupByScanAggregation(Aggregation.denseRank()); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/GroupByScanAggregationOnColumn.java b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregationOnColumn.java new file mode 100644 index 00000000000..75e4936e5b9 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregationOnColumn.java @@ -0,0 +1,64 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * A GroupByScanAggregation for a specific column in a table. + */ +public final class GroupByScanAggregationOnColumn { + protected final GroupByScanAggregation wrapped; + protected final int columnIndex; + + GroupByScanAggregationOnColumn(GroupByScanAggregation wrapped, int columnIndex) { + this.wrapped = wrapped; + this.columnIndex = columnIndex; + } + + public int getColumnIndex() { + return columnIndex; + } + + @Override + public int hashCode() { + return 31 * wrapped.hashCode() + columnIndex; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof GroupByScanAggregationOnColumn) { + GroupByScanAggregationOnColumn o = (GroupByScanAggregationOnColumn) other; + return wrapped.equals(o.wrapped) && columnIndex == o.columnIndex; + } + return false; + } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } + + GroupByScanAggregation getWrapped() { + return wrapped; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/HashJoin.java b/java/src/main/java/ai/rapids/cudf/HashJoin.java new file mode 100644 index 00000000000..620a7ce6a6c --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/HashJoin.java @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.rapids.cudf; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class represents a hash table built from the join keys of the right-side table for a + * join operation. This hash table can then be reused across a series of left probe tables + * to compute gather maps for joins more efficiently when the right-side table is not changing. + * It can also be used to query the output row count of a join and then pass that result to the + * operation that generates the join gather maps to avoid redundant computation when the output + * row count must be checked before manifesting the join gather maps. 
+ */ +public class HashJoin implements AutoCloseable { + static { + NativeDepsLoader.loadNativeDeps(); + } + + private static final Logger log = LoggerFactory.getLogger(HashJoin.class); + + private static class HashJoinCleaner extends MemoryCleaner.Cleaner { + private Table buildKeys; + private long nativeHandle; + + HashJoinCleaner(Table buildKeys, long nativeHandle) { + this.buildKeys = buildKeys; + this.nativeHandle = nativeHandle; + addRef(); + } + + @Override + protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) { + long origAddress = nativeHandle; + boolean neededCleanup = nativeHandle != 0; + if (neededCleanup) { + try { + destroy(nativeHandle); + buildKeys.close(); + buildKeys = null; + } finally { + nativeHandle = 0; + } + if (logErrorIfNotClean) { + log.error("A HASH TABLE WAS LEAKED (ID: " + id + " " + Long.toHexString(origAddress)); + } + } + return neededCleanup; + } + + @Override + public boolean isClean() { + return nativeHandle == 0; + } + } + + private final HashJoinCleaner cleaner; + private final boolean compareNulls; + private boolean isClosed = false; + + /** + * Construct a hash table for a join from a table representing the join key columns from the + * right-side table in the join. The resulting instance must be closed to release the + * GPU resources associated with the instance. + * @param buildKeys table view containing the join keys for the right-side join table + * @param compareNulls true if null key values should match otherwise false + */ + public HashJoin(Table buildKeys, boolean compareNulls) { + this.compareNulls = compareNulls; + Table buildTable = new Table(buildKeys.getColumns()); + try { + long handle = create(buildTable.getNativeView(), compareNulls); + this.cleaner = new HashJoinCleaner(buildTable, handle); + MemoryCleaner.register(this, cleaner); + } catch (Throwable t) { + try { + buildTable.close(); + } catch (Throwable t2) { + t.addSuppressed(t2); + } + throw t; + } + } + + @Override + public synchronized void close() { + cleaner.delRef(); + if (isClosed) { + cleaner.logRefCountDebug("double free " + this); + throw new IllegalStateException("Close called too many times " + this); + } + cleaner.clean(false); + isClosed = true; + } + + long getNativeView() { + return cleaner.nativeHandle; + } + + /** Get the number of join key columns for the table that was used to generate the has table. */ + public long getNumberOfColumns() { + return cleaner.buildKeys.getNumberOfColumns(); + } + + /** Returns true if the hash table was built to match on nulls otherwise false. */ + public boolean getCompareNulls() { + return compareNulls; + } + + private static native long create(long tableView, boolean nullEqual); + private static native void destroy(long handle); +} diff --git a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java index 4bf38543a2d..a936d4830ee 100644 --- a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java +++ b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java @@ -277,6 +277,10 @@ public static void register(CompiledExpression expr, Cleaner cleaner) { all.add(new CleanerWeakReference(expr, cleaner, collected, false)); } + static void register(HashJoin hashJoin, Cleaner cleaner) { + all.add(new CleanerWeakReference(hashJoin, cleaner, collected, true)); + } + /** * This is not 100% perfect and we can still run into situations where RMM buffers were not * collected and this returns false because of thread race conditions. This is just a best effort. 
diff --git a/java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java index f5b0a0f74b3..229cb0262d3 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java @@ -28,6 +28,7 @@ public class ParquetColumnWriterOptions { private boolean isTimestampTypeInt96; private int precision; private boolean isNullable; + private boolean isMap = false; private String columName; private ParquetColumnWriterOptions(AbstractStructBuilder builder) { this.columName = builder.name; @@ -122,6 +123,15 @@ public T withListColumn(ParquetListColumnWriterOptions child) { return (T) this; } + /** + * Set the map column meta. + * @return this for chaining. + */ + public T withMapColumn(ParquetColumnWriterOptions child) { + children.add(child); + return (T) this; + } + /** * Set a child struct meta data * @return this for chaining. @@ -220,22 +230,22 @@ public T withNullableTimestampColumn(String name, boolean isInt96) { public abstract V build(); } - ParquetColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, - int precision, boolean isNullable) { + public ParquetColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, + int precision, boolean isNullable) { this.isTimestampTypeInt96 = isTimestampTypeInt96; this.precision = precision; this.isNullable = isNullable; this.columName = columnName; } - ParquetColumnWriterOptions(String columnName, boolean isNullable) { + public ParquetColumnWriterOptions(String columnName, boolean isNullable) { this.isTimestampTypeInt96 = false; this.precision = 0; this.isNullable = isNullable; this.columName = columnName; } - ParquetColumnWriterOptions(String columnName) { + public ParquetColumnWriterOptions(String columnName) { this(columnName, true); } @@ -295,6 +305,15 @@ boolean[] getFlatIsNullable() { } } + boolean[] getFlatIsMap() { + boolean[] ret = {isMap}; + if (childColumnOptions.length > 0) { + return getFlatBooleans(ret, (opt) -> opt.getFlatIsMap()); + } else { + return ret; + } + } + int[] getFlatNumChildren() { int[] ret = {childColumnOptions.length}; if (childColumnOptions.length > 0) { @@ -351,6 +370,27 @@ protected String[] getFlatColumnNames(String[] ret) { return result; } + /** + * Add a Map Column to the schema. + *

+ * Maps are List columns with a Struct named 'key_value' with a child named 'key' and a child + * named 'value'. The caller of this method doesn't need to worry about this as this method will + * take care of this without the knowledge of the caller. + */ + public static ParquetColumnWriterOptions mapColumn(String name, ParquetColumnWriterOptions key, + ParquetColumnWriterOptions value) { + ParquetStructColumnWriterOptions struct = structBuilder("key_value").build(); + if (key.isNullable) { + throw new IllegalArgumentException("key column can not be nullable"); + } + struct.childColumnOptions = new ParquetColumnWriterOptions[]{key, value}; + ParquetColumnWriterOptions opt = listBuilder(name) + .withStructColumn(struct) + .build(); + opt.isMap = true; + return opt; + } + /** * Creates a ListBuilder for column called 'name' */ diff --git a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java index 9992ae9eaf1..38f8d8e59a4 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java @@ -57,6 +57,11 @@ boolean[] getFlatIsNullable() { return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatIsNullable()); } + @Override + boolean[] getFlatIsMap() { + return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatIsMap()); + } + @Override String[] getFlatColumnNames() { return super.getFlatColumnNames(new String[]{}); diff --git a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java new file mode 100644 index 00000000000..7eff85dcd0d --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java @@ -0,0 +1,212 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * An aggregation that can be used for a reduce. + */ +public final class ReductionAggregation { + private final Aggregation wrapped; + + private ReductionAggregation(Aggregation wrapped) { + this.wrapped = wrapped; + } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } + + Aggregation getWrapped() { + return wrapped; + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof ReductionAggregation) { + ReductionAggregation o = (ReductionAggregation) other; + return wrapped.equals(o.wrapped); + } + return false; + } + + /** + * Sum Aggregation + */ + public static ReductionAggregation sum() { + return new ReductionAggregation(Aggregation.sum()); + } + + /** + * Product Aggregation. 
+ */ + public static ReductionAggregation product() { + return new ReductionAggregation(Aggregation.product()); + } + + /** + * Min Aggregation + */ + public static ReductionAggregation min() { + return new ReductionAggregation(Aggregation.min()); + } + + /** + * Max Aggregation + */ + public static ReductionAggregation max() { + return new ReductionAggregation(Aggregation.max()); + } + + /** + * Any reduction. Produces a true or 1, depending on the output type, + * if any of the elements in the range are true or non-zero, otherwise produces a false or 0. + * Null values are skipped. + */ + public static ReductionAggregation any() { + return new ReductionAggregation(Aggregation.any()); + } + + /** + * All reduction. Produces true or 1, depending on the output type, if all of the elements in + * the range are true or non-zero, otherwise produces a false or 0. + * Null values are skipped. + */ + public static ReductionAggregation all() { + return new ReductionAggregation(Aggregation.all()); + } + + + /** + * Sum of squares reduction. + */ + public static ReductionAggregation sumOfSquares() { + return new ReductionAggregation(Aggregation.sumOfSquares()); + } + + /** + * Arithmetic mean reduction. + */ + public static ReductionAggregation mean() { + return new ReductionAggregation(Aggregation.mean()); + } + + + /** + * Variance aggregation with 1 as the delta degrees of freedom. + */ + public static ReductionAggregation variance() { + return new ReductionAggregation(Aggregation.variance()); + } + + /** + * Variance aggregation. + * @param ddof delta degrees of freedom. The divisor used in calculation of variance is + * N - ddof, where N is the population size. + */ + public static ReductionAggregation variance(int ddof) { + return new ReductionAggregation(Aggregation.variance(ddof)); + } + + /** + * Standard deviation aggregation with 1 as the delta degrees of freedom. + */ + public static ReductionAggregation standardDeviation() { + return new ReductionAggregation(Aggregation.standardDeviation()); + } + + /** + * Standard deviation aggregation. + * @param ddof delta degrees of freedom. The divisor used in calculation of std is + * N - ddof, where N is the population size. + */ + public static ReductionAggregation standardDeviation(int ddof) { + return new ReductionAggregation(Aggregation.standardDeviation(ddof)); + } + + + /** + * Median reduction. + */ + public static ReductionAggregation median() { + return new ReductionAggregation(Aggregation.median()); + } + + /** + * Aggregate to compute the specified quantiles. Uses linear interpolation by default. + */ + public static ReductionAggregation quantile(double ... quantiles) { + return new ReductionAggregation(Aggregation.quantile(quantiles)); + } + + /** + * Aggregate to compute various quantiles. + */ + public static ReductionAggregation quantile(QuantileMethod method, double ... quantiles) { + return new ReductionAggregation(Aggregation.quantile(method, quantiles)); + } + + + /** + * Number of unique, non-null, elements. + */ + public static ReductionAggregation nunique() { + return new ReductionAggregation(Aggregation.nunique()); + } + + /** + * Number of unique elements. + * @param nullPolicy INCLUDE if nulls should be counted else EXCLUDE. If nulls are counted they + * compare as equal so multiple null values in a range would all only + * increase the count by 1. 
+ */ + public static ReductionAggregation nunique(NullPolicy nullPolicy) { + return new ReductionAggregation(Aggregation.nunique(nullPolicy)); + } + + /** + * Get the nth, non-null, element in a group. + * @param offset the offset to look at. Negative numbers go from the end of the group. Any + * value outside of the group range results in a null. + */ + public static ReductionAggregation nth(int offset) { + return new ReductionAggregation(Aggregation.nth(offset)); + } + + /** + * Get the nth element in a group. + * @param offset the offset to look at. Negative numbers go from the end of the group. Any + * value outside of the group range results in a null. + * @param nullPolicy INCLUDE if nulls should be included in the aggregation or EXCLUDE if they + * should be skipped. + */ + public static ReductionAggregation nth(int offset, NullPolicy nullPolicy) { + return new ReductionAggregation(Aggregation.nth(offset, nullPolicy)); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/RollingAggregation.java b/java/src/main/java/ai/rapids/cudf/RollingAggregation.java index 9b80924463a..07983f77aad 100644 --- a/java/src/main/java/ai/rapids/cudf/RollingAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/RollingAggregation.java @@ -19,11 +19,189 @@ package ai.rapids.cudf; /** - * Used to tag an aggregation as something that is compatible with rolling window operations. - * Do not try to implement this yourself + * An aggregation that can be used on rolling windows. */ -public interface RollingAggregation { - default T getBaseAggregation() { - return (T)this; +public final class RollingAggregation { + private final Aggregation wrapped; + + private RollingAggregation(Aggregation wrapped) { + this.wrapped = wrapped; + } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } + + /** + * Add a column to the Aggregation so it can be used on a specific column of data. + * @param columnIndex the index of the column to operate on. + */ + public RollingAggregationOnColumn onColumn(int columnIndex) { + return new RollingAggregationOnColumn(this, columnIndex); + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof RollingAggregation) { + RollingAggregation o = (RollingAggregation) other; + return wrapped.equals(o.wrapped); + } + return false; + } + + /** + * Rolling Window Sum + */ + public static RollingAggregation sum() { + return new RollingAggregation(Aggregation.sum()); + } + + + /** + * Rolling Window Min + */ + public static RollingAggregation min() { + return new RollingAggregation(Aggregation.min()); + } + + /** + * Rolling Window Max + */ + public static RollingAggregation max() { + return new RollingAggregation(Aggregation.max()); + } + + + /** + * Count number of valid, a.k.a. non-null, elements. + */ + public static RollingAggregation count() { + return new RollingAggregation(Aggregation.count()); + } + + /** + * Count number of elements. + * @param nullPolicy INCLUDE if nulls should be counted. EXCLUDE if only non-null values + * should be counted. + */ + public static RollingAggregation count(NullPolicy nullPolicy) { + return new RollingAggregation(Aggregation.count(nullPolicy)); + } + + /** + * Arithmetic Mean + */ + public static RollingAggregation mean() { + return new RollingAggregation(Aggregation.mean()); + } + + + /** + * Index of max element. 
+ */ + public static RollingAggregation argMax() { + return new RollingAggregation(Aggregation.argMax()); + } + + /** + * Index of min element. + */ + public static RollingAggregation argMin() { + return new RollingAggregation(Aggregation.argMin()); + } + + + /** + * Get the row number. + */ + public static RollingAggregation rowNumber() { + return new RollingAggregation(Aggregation.rowNumber()); + } + + + /** + * In a rolling window return the value offset entries ahead or null if it is outside of the + * window. + */ + public static RollingAggregation lead(int offset) { + return lead(offset, null); + } + + /** + * In a rolling window return the value offset entries ahead or the corresponding value from + * defaultOutput if it is outside of the window. Note that this does not take any ownership of + * defaultOutput and the caller must ensure that defaultOutput remains valid during the life + * time of this aggregation operation. + */ + public static RollingAggregation lead(int offset, ColumnVector defaultOutput) { + return new RollingAggregation(Aggregation.lead(offset, defaultOutput)); + } + + + + /** + * In a rolling window return the value offset entries behind or null if it is outside of the + * window. + */ + public static RollingAggregation lag(int offset) { + return lag(offset, null); + } + + /** + * In a rolling window return the value offset entries behind or the corresponding value from + * defaultOutput if it is outside of the window. Note that this does not take any ownership of + * defaultOutput and the caller must ensure that defaultOutput remains valid during the life + * time of this aggregation operation. + */ + public static RollingAggregation lag(int offset, ColumnVector defaultOutput) { + return new RollingAggregation(Aggregation.lag(offset, defaultOutput)); + } + + + /** + * Collect the values into a list. Nulls will be skipped. + */ + public static RollingAggregation collectList() { + return new RollingAggregation(Aggregation.collectList()); + } + + /** + * Collect the values into a list. + * + * @param nullPolicy Indicates whether to include/exclude nulls during collection. + */ + public static RollingAggregation collectList(NullPolicy nullPolicy) { + return new RollingAggregation(Aggregation.collectList(nullPolicy)); + } + + + /** + * Collect the values into a set. All null values will be excluded, and all NaN values are regarded as + * unique instances. + */ + public static RollingAggregation collectSet() { + return new RollingAggregation(Aggregation.collectSet()); + } + + /** + * Collect the values into a set. + * + * @param nullPolicy Indicates whether to include/exclude nulls during collection. + * @param nullEquality Flag to specify whether null entries within each list should be considered equal. + * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal.
+ */ + public static RollingAggregation collectSet(NullPolicy nullPolicy, NullEquality nullEquality, NaNEquality nanEquality) { + return new RollingAggregation(Aggregation.collectSet(nullPolicy, nullEquality, nanEquality)); } } diff --git a/java/src/main/java/ai/rapids/cudf/AggregationOnColumn.java b/java/src/main/java/ai/rapids/cudf/RollingAggregationOnColumn.java similarity index 55% rename from java/src/main/java/ai/rapids/cudf/AggregationOnColumn.java rename to java/src/main/java/ai/rapids/cudf/RollingAggregationOnColumn.java index bb1404e5a07..a6b1484aa71 100644 --- a/java/src/main/java/ai/rapids/cudf/AggregationOnColumn.java +++ b/java/src/main/java/ai/rapids/cudf/RollingAggregationOnColumn.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,47 +19,24 @@ package ai.rapids.cudf; /** - * An Aggregation instance that also holds a column number so the aggregation can be done on - * a specific column of data in a table. + * A RollingAggregation for a specific column in a table. */ -public class AggregationOnColumn extends Aggregation { - protected final T wrapped; +public final class RollingAggregationOnColumn { + protected final RollingAggregation wrapped; protected final int columnIndex; - AggregationOnColumn(T wrapped, int columnIndex) { - super(wrapped.kind); + RollingAggregationOnColumn(RollingAggregation wrapped, int columnIndex) { this.wrapped = wrapped; this.columnIndex = columnIndex; } - @Override - public AggregationOnColumn onColumn(int columnIndex) { - if (columnIndex == getColumnIndex()) { - return this; // NOOP - } else { - return new AggregationOnColumn(this.wrapped, columnIndex); - } - } - - /** - * Do the aggregation over a given Window. - */ - public > AggregationOverWindow overWindow(WindowOptions windowOptions) { - return new AggregationOverWindow(wrapped, columnIndex, windowOptions); - } - public int getColumnIndex() { return columnIndex; } - @Override - long createNativeInstance() { - return wrapped.createNativeInstance(); - } - @Override - long getDefaultOutput() { - return wrapped.getDefaultOutput(); + public AggregationOverWindow overWindow(WindowOptions windowOptions) { + return new AggregationOverWindow(this, windowOptions); } @Override @@ -71,10 +48,18 @@ public int hashCode() { public boolean equals(Object other) { if (other == this) { return true; - } else if (other instanceof AggregationOnColumn) { - AggregationOnColumn o = (AggregationOnColumn) other; + } else if (other instanceof RollingAggregationOnColumn) { + RollingAggregationOnColumn o = (RollingAggregationOnColumn) other; return wrapped.equals(o.wrapped) && columnIndex == o.columnIndex; } return false; } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } } diff --git a/java/src/main/java/ai/rapids/cudf/ScanAggregation.java b/java/src/main/java/ai/rapids/cudf/ScanAggregation.java new file mode 100644 index 00000000000..08489562adc --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/ScanAggregation.java @@ -0,0 +1,100 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * An aggregation that can be used for a scan. + */ +public final class ScanAggregation { + private final Aggregation wrapped; + + private ScanAggregation(Aggregation wrapped) { + this.wrapped = wrapped; + } + + long createNativeInstance() { + return wrapped.createNativeInstance(); + } + + long getDefaultOutput() { + return wrapped.getDefaultOutput(); + } + + Aggregation getWrapped() { + return wrapped; + } + + @Override + public int hashCode() { + return wrapped.hashCode(); + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } else if (other instanceof ScanAggregation) { + ScanAggregation o = (ScanAggregation) other; + return wrapped.equals(o.wrapped); + } + return false; + } + + /** + * Sum Aggregation + */ + public static ScanAggregation sum() { + return new ScanAggregation(Aggregation.sum()); + } + + /** + * Product Aggregation. + */ + public static ScanAggregation product() { + return new ScanAggregation(Aggregation.product()); + } + + /** + * Min Aggregation + */ + public static ScanAggregation min() { + return new ScanAggregation(Aggregation.min()); + } + + /** + * Max Aggregation + */ + public static ScanAggregation max() { + return new ScanAggregation(Aggregation.max()); + } + + /** + * Get the row's ranking. + */ + public static ScanAggregation rank() { + return new ScanAggregation(Aggregation.rank()); + } + + /** + * Get the row's dense ranking. 
+ */ + public static ScanAggregation denseRank() { + return new ScanAggregation(Aggregation.denseRank()); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java index f0bc3d930d9..c90d27efa97 100644 --- a/java/src/main/java/ai/rapids/cudf/Schema.java +++ b/java/src/main/java/ai/rapids/cudf/Schema.java @@ -27,11 +27,11 @@ public class Schema { public static final Schema INFERRED = new Schema(); private final List<String> names; - private final List<String> typeNames; + private final List<DType> types; - private Schema(List<String> names, List<String> typeNames) { + private Schema(List<String> names, List<DType> types) { this.names = new ArrayList<>(names); - this.typeNames = new ArrayList<>(typeNames); + this.types = new ArrayList<>(types); } /** @@ -39,7 +39,7 @@ private Schema(List<String> names, List<String> typeNames) { */ private Schema() { names = null; - typeNames = null; + types = null; } public static Builder builder() { @@ -53,25 +53,40 @@ public String[] getColumnNames() { return names.toArray(new String[names.size()]); } - String[] getTypesAsStrings() { - if (typeNames == null) { + int[] getTypeIds() { + if (types == null) { return null; } - return typeNames.toArray(new String[typeNames.size()]); + int[] ret = new int[types.size()]; + for (int i = 0; i < types.size(); i++) { + ret[i] = types.get(i).getTypeId().nativeId; + } + return ret; + } + + int[] getTypeScales() { + if (types == null) { + return null; + } + int[] ret = new int[types.size()]; + for (int i = 0; i < types.size(); i++) { + ret[i] = types.get(i).getScale(); + } + return ret; } public static class Builder { private final List<String> names = new ArrayList<>(); - private final List<String> typeNames = new ArrayList<>(); + private final List<DType> types = new ArrayList<>(); public Builder column(DType type, String name) { - typeNames.add(type.getSimpleName()); + types.add(type); names.add(name); return this; } public Schema build() { - return new Schema(names, typeNames); + return new Schema(names, types); } } } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 96a9b608f06..eeb2d308f1a 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -170,10 +170,19 @@ public long getDeviceMemorySize() { return total; } + /** + * This method is internal and exposed purely for testing purposes + */ + static Table removeNullMasksIfNeeded(Table table) { + return new Table(removeNullMasksIfNeeded(table.nativeHandle)); + } + ///////////////////////////////////////////////////////////////////////////// // NATIVE APIs ///////////////////////////////////////////////////////////////////////////// - + + private static native long[] removeNullMasksIfNeeded(long tableView) throws CudfException; + private static native ContiguousTable[] contiguousSplit(long inputTable, int[] indices); private static native long[] partition(long inputTable, long partitionView, @@ -200,7 +209,8 @@ private static native long bound(long inputTable, long valueTable, * into a java * object to try and pull out all of the options. If this becomes unwieldy we can change it. * @param columnNames names of all of the columns, even the ones filtered out - * @param dTypes types of all of the columns as strings. Why strings? who knows. + * @param dTypeIds native type IDs of all of the columns. + * @param dTypeScales scale of the type for all of the columns.
* @param filterColumnNames name of the columns to read, or an empty array if we want to read * all of them * @param filePath the path of the file to read, or null if no path should be read. @@ -214,7 +224,8 @@ private static native long bound(long inputTable, long valueTable, * @param trueValues values that should be treated as boolean true * @param falseValues values that should be treated as boolean false */ - private static native long[] readCSV(String[] columnNames, String[] dTypes, + private static native long[] readCSV(String[] columnNames, + int[] dTypeIds, int[] dTypeScales, String[] filterColumnNames, String filePath, long address, long length, int headerRow, byte delim, byte quote, @@ -248,6 +259,7 @@ private static native long[] readParquet(String[] filterColumnNames, String file * @param isInt96 true if timestamp type is int96 * @param precisions precision list containing all the precisions of the decimal types in * the columns + * @param isMapValues true if a column is a map * @param filename local output path * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ @@ -261,7 +273,7 @@ private static native long writeParquetFileBegin(String[] columnNames, int statsFreq, boolean[] isInt96, int[] precisions, - String filename) throws CudfException; + boolean[] isMapValues, String filename) throws CudfException; /** * Setup everything to write parquet formatted data to a buffer. @@ -276,6 +288,7 @@ private static native long writeParquetFileBegin(String[] columnNames, * @param isInt96 true if timestamp type is int96 * @param precisions precision list containing all the precisions of the decimal types in * the columns + * @param isMapValues true if a column is a map * @param consumer consumer of host buffers produced. * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. 
*/ @@ -289,6 +302,7 @@ private static native long writeParquetBufferBegin(String[] columnNames, int statsFreq, boolean[] isInt96, int[] precisions, + boolean[] isMapValues, HostBufferConsumer consumer) throws CudfException; /** @@ -500,18 +514,48 @@ private static native long[] leftJoin(long leftTable, int[] leftJoinCols, long r private static native long[] leftJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; + private static native long leftJoinRowCount(long leftTable, long rightHashJoin, + boolean nullsEqual) throws CudfException; + + private static native long[] leftHashJoinGatherMaps(long leftTable, long rightHashJoin, + boolean nullsEqual) throws CudfException; + + private static native long[] leftHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin, + boolean nullsEqual, + long outputRowCount) throws CudfException; + private static native long[] innerJoin(long leftTable, int[] leftJoinCols, long rightTable, int[] rightJoinCols, boolean compareNullsEqual) throws CudfException; private static native long[] innerJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; + private static native long innerJoinRowCount(long table, long hashJoin, + boolean nullsEqual) throws CudfException; + + private static native long[] innerHashJoinGatherMaps(long table, long hashJoin, + boolean nullsEqual) throws CudfException; + + private static native long[] innerHashJoinGatherMapsWithCount(long table, long hashJoin, + boolean nullsEqual, + long outputRowCount) throws CudfException; + private static native long[] fullJoin(long leftTable, int[] leftJoinCols, long rightTable, int[] rightJoinCols, boolean compareNullsEqual) throws CudfException; private static native long[] fullJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; + private static native long fullJoinRowCount(long leftTable, long rightHashJoin, + boolean nullsEqual) throws CudfException; + + private static native long[] fullHashJoinGatherMaps(long leftTable, long rightHashJoin, + boolean nullsEqual) throws CudfException; + + private static native long[] fullHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin, + boolean nullsEqual, + long outputRowCount) throws CudfException; + private static native long[] leftSemiJoin(long leftTable, int[] leftJoinCols, long rightTable, int[] rightJoinCols, boolean compareNullsEqual) throws CudfException; @@ -524,26 +568,67 @@ private static native long[] leftAntiJoin(long leftTable, int[] leftJoinCols, lo private static native long[] leftAntiJoinGatherMap(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; + private static native long conditionalLeftJoinRowCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalLeftJoinGatherMaps(long leftTable, long rightTable, long condition, boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalLeftJoinGatherMapsWithCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual, + long rowCount) throws CudfException; + + private static native long conditionalInnerJoinRowCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalInnerJoinGatherMaps(long leftTable, long rightTable, long condition, boolean compareNullsEqual) throws CudfException; + private 
static native long[] conditionalInnerJoinGatherMapsWithCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual, + long rowCount) throws CudfException; + private static native long[] conditionalFullJoinGatherMaps(long leftTable, long rightTable, long condition, boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalFullJoinGatherMapsWithCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual, + long rowCount) throws CudfException; + + private static native long conditionalLeftSemiJoinRowCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalLeftSemiJoinGatherMap(long leftTable, long rightTable, long condition, boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalLeftSemiJoinGatherMapWithCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual, + long rowCount) throws CudfException; + + private static native long conditionalLeftAntiJoinRowCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalLeftAntiJoinGatherMap(long leftTable, long rightTable, long condition, boolean compareNullsEqual) throws CudfException; + private static native long[] conditionalLeftAntiJoinGatherMapWithCount(long leftTable, long rightTable, + long condition, + boolean compareNullsEqual, + long rowCount) throws CudfException; + private static native long[] crossJoin(long leftTable, long rightTable) throws CudfException; private static native long[] concatenate(long[] cudfTablePointers) throws CudfException; @@ -608,7 +693,7 @@ public static Table readCSV(Schema schema, File path) { */ public static Table readCSV(Schema schema, CSVOptions opts, File path) { return new Table( - readCSV(schema.getColumnNames(), schema.getTypesAsStrings(), + readCSV(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), opts.getIncludeColumnNames(), path.getAbsolutePath(), 0, 0, opts.getHeaderRow(), @@ -681,7 +766,7 @@ public static Table readCSV(Schema schema, CSVOptions opts, HostMemoryBuffer buf assert len > 0; assert len <= buffer.getLength() - offset; assert offset >= 0 && offset < buffer.length; - return new Table(readCSV(schema.getColumnNames(), schema.getTypesAsStrings(), + return new Table(readCSV(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), opts.getIncludeColumnNames(), null, buffer.getAddress() + offset, len, opts.getHeaderRow(), @@ -864,6 +949,7 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { String[] columnNames = options.getFlatColumnNames(); boolean[] columnNullabilities = options.getFlatIsNullable(); boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96(); + boolean[] isMapValues = options.getFlatIsMap(); int[] precisions = options.getFlatPrecision(); int[] flatNumChildren = options.getFlatNumChildren(); @@ -878,6 +964,7 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { options.getStatisticsFrequency().nativeId, timeInt96Values, precisions, + isMapValues, outputFile.getAbsolutePath()); } @@ -885,6 +972,7 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons String[] columnNames = options.getFlatColumnNames(); boolean[] columnNullabilities = options.getFlatIsNullable(); boolean[] timeInt96Values = options.getFlatIsTimeTypeInt96(); + boolean[] 
isMapValues = options.getFlatIsMap(); int[] precisions = options.getFlatPrecision(); int[] flatNumChildren = options.getFlatNumChildren(); @@ -899,6 +987,7 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons options.getStatisticsFrequency().nativeId, timeInt96Values, precisions, + isMapValues, consumer); } @@ -1990,6 +2079,84 @@ public GatherMap[] leftJoinGatherMaps(Table rightKeys, boolean compareNullsEqual return buildJoinGatherMaps(gatherMapData); } + /** + * Computes the number of rows resulting from a left equi-join between two tables. + * It is assumed this table instance holds the key columns from the left table, and the + * {@link HashJoin} argument has been constructed from the key columns from the right table. + * @param rightHash hash table built from join key columns from the right table + * @return row count of the join result + */ + public long leftJoinRowCount(HashJoin rightHash) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + return leftJoinRowCount(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls()); + } + + /** + * Computes the gather maps that can be used to manifest the result of a left equi-join between + * two tables. It is assumed this table instance holds the key columns from the left table, and + * the {@link HashJoin} argument has been constructed from the key columns from the right table. + * Two {@link GatherMap} instances will be returned that can be used to gather the left and right + * tables, respectively, to produce the result of the left join. + * It is the responsibility of the caller to close the resulting gather map instances. + * @param rightHash hash table built from join key columns from the right table + * @return left and right table gather maps + */ + public GatherMap[] leftJoinGatherMaps(HashJoin rightHash) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + long[] gatherMapData = + leftHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls()); + return buildJoinGatherMaps(gatherMapData); + } + + /** + * Computes the gather maps that can be used to manifest the result of a left equi-join between + * two tables. It is assumed this table instance holds the key columns from the left table, and + * the {@link HashJoin} argument has been constructed from the key columns from the right table. + * Two {@link GatherMap} instances will be returned that can be used to gather the left and right + * tables, respectively, to produce the result of the left join. + * It is the responsibility of the caller to close the resulting gather map instances. + * This interface allows passing an output row count that was previously computed from + * {@link #leftJoinRowCount(HashJoin)}. + * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. 
+ * @param rightHash hash table built from join key columns from the right table + * @param outputRowCount number of output rows in the join result + * @return left and right table gather maps + */ + public GatherMap[] leftJoinGatherMaps(HashJoin rightHash, long outputRowCount) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + long[] gatherMapData = + leftHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls(), outputRowCount); + return buildJoinGatherMaps(gatherMapData); + } + + /** + * Computes the number of rows from the result of a left join between two tables when a + * conditional expression is true. It is assumed this table instance holds the columns from + * the left table, and the table argument represents the columns from the right table. + * @param rightTable the right side table of the join in the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @return row count for the join result + */ + public long conditionalLeftJoinRowCount(Table rightTable, CompiledExpression condition, + boolean compareNullsEqual) { + return conditionalLeftJoinRowCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual); + } + /** * Computes the gather maps that can be used to manifest the result of a left join between * two tables when a conditional expression is true. It is assumed this table instance holds @@ -2002,18 +2169,42 @@ public GatherMap[] leftJoinGatherMaps(Table rightKeys, boolean compareNullsEqual * @param compareNullsEqual true if null key values should match otherwise false * @return left and right table gather maps */ - public GatherMap[] leftJoinGatherMaps(Table rightTable, CompiledExpression condition, - boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } + public GatherMap[] conditionalLeftJoinGatherMaps(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { long[] gatherMapData = conditionalLeftJoinGatherMaps(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); return buildJoinGatherMaps(gatherMapData); } + /** + * Computes the gather maps that can be used to manifest the result of a left join between + * two tables when a conditional expression is true. It is assumed this table instance holds + * the columns from the left table, and the table argument represents the columns from the + * right table. Two {@link GatherMap} instances will be returned that can be used to gather + * the left and right tables, respectively, to produce the result of the left join. + * It is the responsibility of the caller to close the resulting gather map instances. + * This interface allows passing an output row count that was previously computed from + * {@link #conditionalLeftJoinRowCount(Table, CompiledExpression, boolean)}. + * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. 
+ * @param rightTable the right side table of the join in the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @param outputRowCount number of output rows in the join result + * @return left and right table gather maps + */ + public GatherMap[] conditionalLeftJoinGatherMaps(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual, + long outputRowCount) { + long[] gatherMapData = + conditionalLeftJoinGatherMapsWithCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual, outputRowCount); + return buildJoinGatherMaps(gatherMapData); + } + /** * Computes the gather maps that can be used to manifest the result of an inner equi-join between * two tables. It is assumed this table instance holds the key columns from the left table, and @@ -2035,6 +2226,83 @@ public GatherMap[] innerJoinGatherMaps(Table rightKeys, boolean compareNullsEqua return buildJoinGatherMaps(gatherMapData); } + /** + * Computes the number of rows resulting from an inner equi-join between two tables. + * @param otherHash hash table built from join key columns from the other table + * @return row count of the join result + */ + public long innerJoinRowCount(HashJoin otherHash) { + if (getNumberOfColumns() != otherHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "otherKeys: " + otherHash.getNumberOfColumns()); + } + return innerJoinRowCount(getNativeView(), otherHash.getNativeView(), + otherHash.getCompareNulls()); + } + + /** + * Computes the gather maps that can be used to manifest the result of an inner equi-join between + * two tables. It is assumed this table instance holds the key columns from the left table, and + * the {@link HashJoin} argument has been constructed from the key columns from the right table. + * Two {@link GatherMap} instances will be returned that can be used to gather the left and right + * tables, respectively, to produce the result of the inner join. + * It is the responsibility of the caller to close the resulting gather map instances. + * @param rightHash hash table built from join key columns from the right table + * @return left and right table gather maps + */ + public GatherMap[] innerJoinGatherMaps(HashJoin rightHash) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + long[] gatherMapData = + innerHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls()); + return buildJoinGatherMaps(gatherMapData); + } + + /** + * Computes the gather maps that can be used to manifest the result of an inner equi-join between + * two tables. It is assumed this table instance holds the key columns from the left table, and + * the {@link HashJoin} argument has been constructed from the key columns from the right table. + * Two {@link GatherMap} instances will be returned that can be used to gather the left and right + * tables, respectively, to produce the result of the inner join. + * It is the responsibility of the caller to close the resulting gather map instances. + * This interface allows passing an output row count that was previously computed from + * {@link #innerJoinRowCount(HashJoin)}. 
+ * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. + * @param rightHash hash table built from join key columns from the right table + * @param outputRowCount number of output rows in the join result + * @return left and right table gather maps + */ + public GatherMap[] innerJoinGatherMaps(HashJoin rightHash, long outputRowCount) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + long[] gatherMapData = + innerHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls(), outputRowCount); + return buildJoinGatherMaps(gatherMapData); + } + + /** + * Computes the number of rows from the result of an inner join between two tables when a + * conditional expression is true. It is assumed this table instance holds the columns from + * the left table, and the table argument represents the columns from the right table. + * @param rightTable the right side table of the join in the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @return row count for the join result + */ + public long conditionalInnerJoinRowCount(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { + return conditionalInnerJoinRowCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual); + } + /** * Computes the gather maps that can be used to manifest the result of an inner join between * two tables when a conditional expression is true. It is assumed this table instance holds @@ -2047,18 +2315,42 @@ public GatherMap[] innerJoinGatherMaps(Table rightKeys, boolean compareNullsEqua * @param compareNullsEqual true if null key values should match otherwise false * @return left and right table gather maps */ - public GatherMap[] innerJoinGatherMaps(Table rightTable, CompiledExpression condition, - boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } + public GatherMap[] conditionalInnerJoinGatherMaps(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { long[] gatherMapData = conditionalInnerJoinGatherMaps(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); return buildJoinGatherMaps(gatherMapData); } + /** + * Computes the gather maps that can be used to manifest the result of an inner join between + * two tables when a conditional expression is true. It is assumed this table instance holds + * the columns from the left table, and the table argument represents the columns from the + * right table. Two {@link GatherMap} instances will be returned that can be used to gather + * the left and right tables, respectively, to produce the result of the inner join. + * It is the responsibility of the caller to close the resulting gather map instances. + * This interface allows passing an output row count that was previously computed from + * {@link #conditionalInnerJoinRowCount(Table, CompiledExpression, boolean)}. + * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. 
+ * @param rightTable the right side table of the join in the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @param outputRowCount number of output rows in the join result + * @return left and right table gather maps + */ + public GatherMap[] conditionalInnerJoinGatherMaps(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual, + long outputRowCount) { + long[] gatherMapData = + conditionalInnerJoinGatherMapsWithCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual, outputRowCount); + return buildJoinGatherMaps(gatherMapData); + } + /** * Computes the gather maps that can be used to manifest the result of an full equi-join between * two tables. It is assumed this table instance holds the key columns from the left table, and @@ -2080,6 +2372,72 @@ public GatherMap[] fullJoinGatherMaps(Table rightKeys, boolean compareNullsEqual return buildJoinGatherMaps(gatherMapData); } + /** + * Computes the number of rows resulting from a full equi-join between two tables. + * It is assumed this table instance holds the key columns from the left table, and the + * {@link HashJoin} argument has been constructed from the key columns from the right table. + * Note that unlike {@link #leftJoinRowCount(HashJoin)} and {@link #innerJoinRowCount(HashJoin), + * this will perform some redundant calculations compared to + * {@link #fullJoinGatherMaps(HashJoin, long)}. + * @param rightHash hash table built from join key columns from the right table + * @return row count of the join result + */ + public long fullJoinRowCount(HashJoin rightHash) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + return fullJoinRowCount(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls()); + } + + /** + * Computes the gather maps that can be used to manifest the result of a full equi-join between + * two tables. It is assumed this table instance holds the key columns from the left table, and + * the {@link HashJoin} argument has been constructed from the key columns from the right table. + * Two {@link GatherMap} instances will be returned that can be used to gather the left and right + * tables, respectively, to produce the result of the full join. + * It is the responsibility of the caller to close the resulting gather map instances. + * @param rightHash hash table built from join key columns from the right table + * @return left and right table gather maps + */ + public GatherMap[] fullJoinGatherMaps(HashJoin rightHash) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + long[] gatherMapData = + fullHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls()); + return buildJoinGatherMaps(gatherMapData); + } + + /** + * Computes the gather maps that can be used to manifest the result of a full equi-join between + * two tables. It is assumed this table instance holds the key columns from the left table, and + * the {@link HashJoin} argument has been constructed from the key columns from the right table. 
+ * Two {@link GatherMap} instances will be returned that can be used to gather the left and right + * tables, respectively, to produce the result of the full join. + * It is the responsibility of the caller to close the resulting gather map instances. + * This interface allows passing an output row count that was previously computed from + * {@link #fullJoinRowCount(HashJoin)}. + * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. + * @param rightHash hash table built from join key columns from the right table + * @param outputRowCount number of output rows in the join result + * @return left and right table gather maps + */ + public GatherMap[] fullJoinGatherMaps(HashJoin rightHash, long outputRowCount) { + if (getNumberOfColumns() != rightHash.getNumberOfColumns()) { + throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + + "rightKeys: " + rightHash.getNumberOfColumns()); + } + long[] gatherMapData = + fullHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), + rightHash.getCompareNulls(), outputRowCount); + return buildJoinGatherMaps(gatherMapData); + } + /** * Computes the gather maps that can be used to manifest the result of a full join between * two tables when a conditional expression is true. It is assumed this table instance holds @@ -2092,12 +2450,9 @@ public GatherMap[] fullJoinGatherMaps(Table rightKeys, boolean compareNullsEqual * @param compareNullsEqual true if null key values should match otherwise false * @return left and right table gather maps */ - public GatherMap[] fullJoinGatherMaps(Table rightTable, CompiledExpression condition, - boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } + public GatherMap[] conditionalFullJoinGatherMaps(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { long[] gatherMapData = conditionalFullJoinGatherMaps(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); @@ -2132,6 +2487,22 @@ public GatherMap leftSemiJoinGatherMap(Table rightKeys, boolean compareNullsEqua return buildSemiJoinGatherMap(gatherMapData); } + /** + * Computes the number of rows from the result of a left semi join between two tables when a + * conditional expression is true. It is assumed this table instance holds the columns from + * the left table, and the table argument represents the columns from the right table. + * @param rightTable the right side table of the join in the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @return row count for the join result + */ + public long conditionalLeftSemiJoinRowCount(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { + return conditionalLeftSemiJoinRowCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual); + } + /** * Computes the gather map that can be used to manifest the result of a left semi join between * two tables when a conditional expression is true. 
It is assumed this table instance holds @@ -2144,18 +2515,42 @@ public GatherMap leftSemiJoinGatherMap(Table rightKeys, boolean compareNullsEqua * @param compareNullsEqual true if null key values should match otherwise false * @return left table gather map */ - public GatherMap leftSemiJoinGatherMap(Table rightTable, CompiledExpression condition, - boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } + public GatherMap conditionalLeftSemiJoinGatherMap(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { long[] gatherMapData = conditionalLeftSemiJoinGatherMap(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); return buildSemiJoinGatherMap(gatherMapData); } + /** + * Computes the gather map that can be used to manifest the result of a left semi join between + * two tables when a conditional expression is true. It is assumed this table instance holds + * the columns from the left table, and the table argument represents the columns from the + * right table. The {@link GatherMap} instance returned can be used to gather the left table + * to produce the result of the left semi join. + * It is the responsibility of the caller to close the resulting gather map instance. + * This interface allows passing an output row count that was previously computed from + * {@link #conditionalLeftSemiJoinRowCount(Table, CompiledExpression, boolean)}. + * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. + * @param rightTable the right side table of the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @param outputRowCount number of output rows in the join result + * @return left table gather map + */ + public GatherMap conditionalLeftSemiJoinGatherMap(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual, + long outputRowCount) { + long[] gatherMapData = + conditionalLeftSemiJoinGatherMapWithCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual, outputRowCount); + return buildSemiJoinGatherMap(gatherMapData); + } + /** * Computes the gather map that can be used to manifest the result of a left anti-join between * two tables. It is assumed this table instance holds the key columns from the left table, and @@ -2177,6 +2572,22 @@ public GatherMap leftAntiJoinGatherMap(Table rightKeys, boolean compareNullsEqua return buildSemiJoinGatherMap(gatherMapData); } + /** + * Computes the number of rows from the result of a left anti join between two tables when a + * conditional expression is true. It is assumed this table instance holds the columns from + * the left table, and the table argument represents the columns from the right table. 
+ * @param rightTable the right side table of the join in the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @return row count for the join result + */ + public long conditionalLeftAntiJoinRowCount(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { + return conditionalLeftAntiJoinRowCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual); + } + /** * Computes the gather map that can be used to manifest the result of a left anti join between * two tables when a conditional expression is true. It is assumed this table instance holds @@ -2189,18 +2600,42 @@ public GatherMap leftAntiJoinGatherMap(Table rightKeys, boolean compareNullsEqua * @param compareNullsEqual true if null key values should match otherwise false * @return left table gather map */ - public GatherMap leftAntiJoinGatherMap(Table rightTable, CompiledExpression condition, - boolean compareNullsEqual) { - if (getNumberOfColumns() != rightTable.getNumberOfColumns()) { - throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + - "rightKeys: " + rightTable.getNumberOfColumns()); - } + public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual) { long[] gatherMapData = conditionalLeftAntiJoinGatherMap(getNativeView(), rightTable.getNativeView(), condition.getNativeHandle(), compareNullsEqual); return buildSemiJoinGatherMap(gatherMapData); } + /** + * Computes the gather map that can be used to manifest the result of a left anti join between + * two tables when a conditional expression is true. It is assumed this table instance holds + * the columns from the left table, and the table argument represents the columns from the + * right table. The {@link GatherMap} instance returned can be used to gather the left table + * to produce the result of the left anti join. + * It is the responsibility of the caller to close the resulting gather map instance. + * This interface allows passing an output row count that was previously computed from + * {@link #conditionalLeftAntiJoinRowCount(Table, CompiledExpression, boolean)}. + * WARNING: Passing a row count that is smaller than the actual row count will result + * in undefined behavior. + * @param rightTable the right side table of the join + * @param condition conditional expression to evaluate during the join + * @param compareNullsEqual true if null key values should match otherwise false + * @param outputRowCount number of output rows in the join result + * @return left table gather map + */ + public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable, + CompiledExpression condition, + boolean compareNullsEqual, + long outputRowCount) { + long[] gatherMapData = + conditionalLeftAntiJoinGatherMapWithCount(getNativeView(), rightTable.getNativeView(), + condition.getNativeHandle(), compareNullsEqual, outputRowCount); + return buildSemiJoinGatherMap(gatherMapData); + } + /** * Convert this table of columns into a row major format that is useful for interacting with other * systems that do row major processing of the data. Currently only fixed-width column types are @@ -2456,7 +2891,7 @@ public static final class GroupByOperation { * 1, 2 * 2, 1 ==> aggregated count */ - public Table aggregate(AggregationOnColumn... aggregates) { + public Table aggregate(GroupByAggregationOnColumn... 
aggregates) { assert aggregates != null; // To improve performance and memory we want to remove duplicate operations @@ -2469,9 +2904,9 @@ public Table aggregate(AggregationOnColumn... aggregates) { int keysLength = operation.indices.length; int totalOps = 0; for (int outputIndex = 0; outputIndex < aggregates.length; outputIndex++) { - AggregationOnColumn agg = aggregates[outputIndex]; + GroupByAggregationOnColumn agg = aggregates[outputIndex]; ColumnOps ops = groupedOps.computeIfAbsent(agg.getColumnIndex(), (idx) -> new ColumnOps()); - totalOps += ops.add(agg, outputIndex + keysLength); + totalOps += ops.add(agg.getWrapped().getWrapped(), outputIndex + keysLength); } int[] aggColumnIndexes = new int[totalOps]; long[] aggOperationInstances = new long[totalOps]; @@ -2808,7 +3243,7 @@ public Table aggregateWindowsOverRanges(AggregationOverWindow... windowAggregate } } - public Table scan(AggregationOnColumn... aggregates) { + public Table scan(GroupByScanAggregationOnColumn... aggregates) { assert aggregates != null; // To improve performance and memory we want to remove duplicate operations @@ -2821,9 +3256,9 @@ public Table scan(AggregationOnColumn... aggregates) { int keysLength = operation.indices.length; int totalOps = 0; for (int outputIndex = 0; outputIndex < aggregates.length; outputIndex++) { - AggregationOnColumn agg = aggregates[outputIndex]; + GroupByScanAggregationOnColumn agg = aggregates[outputIndex]; ColumnOps ops = groupedOps.computeIfAbsent(agg.getColumnIndex(), (idx) -> new ColumnOps()); - totalOps += ops.add(agg, outputIndex + keysLength); + totalOps += ops.add(agg.getWrapped().getWrapped(), outputIndex + keysLength); } int[] aggColumnIndexes = new int[totalOps]; long[] aggOperationInstances = new long[totalOps]; diff --git a/java/src/main/java/ai/rapids/cudf/ast/AstNode.java b/java/src/main/java/ai/rapids/cudf/ast/AstExpression.java similarity index 82% rename from java/src/main/java/ai/rapids/cudf/ast/AstNode.java rename to java/src/main/java/ai/rapids/cudf/ast/AstExpression.java index 78cf39b05d2..5ac15f714f0 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/AstNode.java +++ b/java/src/main/java/ai/rapids/cudf/ast/AstExpression.java @@ -17,14 +17,15 @@ package ai.rapids.cudf.ast; import java.nio.ByteBuffer; +import java.nio.ByteOrder; /** Base class of every node in an AST */ -abstract class AstNode { +public abstract class AstExpression { /** * Enumeration for the types of AST nodes that can appear in a serialized AST. * NOTE: This must be kept in sync with the `jni_serialized_node_type` in CompiledExpression.cpp! 
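The `aggregate` and `scan` entry points above now take the dedicated `GroupByAggregationOnColumn` and `GroupByScanAggregationOnColumn` wrappers instead of the generic `AggregationOnColumn`. A hedged sketch of an aggregate call site, assuming a `GroupByAggregation` factory class that produces those wrappers (it is not shown in this hunk):

```java
// Hypothetical call site: group rows by column 0 and sum column 1 using the
// groupby-specific aggregation type required by the new aggregate() signature.
try (Table result = input.groupBy(0)
         .aggregate(GroupByAggregation.sum().onColumn(1))) {
  // result contains the group keys followed by the per-group sums
}
```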
*/ - protected enum NodeType { + protected enum ExpressionType { VALID_LITERAL(0), NULL_LITERAL(1), COLUMN_REFERENCE(2), @@ -33,7 +34,7 @@ protected enum NodeType { private final byte nativeId; - NodeType(int nativeId) { + ExpressionType(int nativeId) { this.nativeId = (byte) nativeId; assert this.nativeId == nativeId; } @@ -49,6 +50,14 @@ void serialize(ByteBuffer bb) { } } + public CompiledExpression compile() { + int size = getSerializedSize(); + ByteBuffer bb = ByteBuffer.allocate(size); + bb.order(ByteOrder.nativeOrder()); + serialize(bb); + return new CompiledExpression(bb.array()); + } + /** Get the size in bytes of the serialized form of this node and all child nodes */ abstract int getSerializedSize(); diff --git a/java/src/main/java/ai/rapids/cudf/ast/BinaryExpression.java b/java/src/main/java/ai/rapids/cudf/ast/BinaryOperation.java similarity index 72% rename from java/src/main/java/ai/rapids/cudf/ast/BinaryExpression.java rename to java/src/main/java/ai/rapids/cudf/ast/BinaryOperation.java index ed4f95b01e1..c39c1c3a1c5 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/BinaryExpression.java +++ b/java/src/main/java/ai/rapids/cudf/ast/BinaryOperation.java @@ -18,13 +18,13 @@ import java.nio.ByteBuffer; -/** A binary expression consisting of an operator and two operands. */ -public class BinaryExpression extends Expression { +/** A binary operation consisting of an operator and two operands. */ +public class BinaryOperation extends AstExpression { private final BinaryOperator op; - private final AstNode leftInput; - private final AstNode rightInput; + private final AstExpression leftInput; + private final AstExpression rightInput; - public BinaryExpression(BinaryOperator op, AstNode leftInput, AstNode rightInput) { + public BinaryOperation(BinaryOperator op, AstExpression leftInput, AstExpression rightInput) { this.op = op; this.leftInput = leftInput; this.rightInput = rightInput; @@ -32,7 +32,7 @@ public BinaryExpression(BinaryOperator op, AstNode leftInput, AstNode rightInput @Override int getSerializedSize() { - return NodeType.BINARY_EXPRESSION.getSerializedSize() + + return ExpressionType.BINARY_EXPRESSION.getSerializedSize() + op.getSerializedSize() + leftInput.getSerializedSize() + rightInput.getSerializedSize(); @@ -40,7 +40,7 @@ int getSerializedSize() { @Override void serialize(ByteBuffer bb) { - NodeType.BINARY_EXPRESSION.serialize(bb); + ExpressionType.BINARY_EXPRESSION.serialize(bb); op.serialize(bb); leftInput.serialize(bb); rightInput.serialize(bb); diff --git a/java/src/main/java/ai/rapids/cudf/ast/BinaryOperator.java b/java/src/main/java/ai/rapids/cudf/ast/BinaryOperator.java index 12e4d985658..595badb14b6 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/BinaryOperator.java +++ b/java/src/main/java/ai/rapids/cudf/ast/BinaryOperator.java @@ -19,7 +19,7 @@ import java.nio.ByteBuffer; /** - * Enumeration of AST operations that can appear in a binary expression. + * Enumeration of AST operators that can appear in a binary operation. * NOTE: This must be kept in sync with `jni_to_binary_operator` in CompiledExpression.cpp! */ public enum BinaryOperator { diff --git a/java/src/main/java/ai/rapids/cudf/ast/ColumnReference.java b/java/src/main/java/ai/rapids/cudf/ast/ColumnReference.java index 34e4064e23b..4860a088a83 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/ColumnReference.java +++ b/java/src/main/java/ai/rapids/cudf/ast/ColumnReference.java @@ -19,7 +19,7 @@ import java.nio.ByteBuffer; /** A reference to a column in an input table. 
*/ -public final class ColumnReference extends AstNode { +public final class ColumnReference extends AstExpression { private final int columnIndex; private final TableReference tableSource; @@ -37,14 +37,14 @@ public ColumnReference(int columnIndex, TableReference tableSource) { @Override int getSerializedSize() { // node type + table ref + column index - return NodeType.COLUMN_REFERENCE.getSerializedSize() + + return ExpressionType.COLUMN_REFERENCE.getSerializedSize() + tableSource.getSerializedSize() + Integer.BYTES; } @Override void serialize(ByteBuffer bb) { - NodeType.COLUMN_REFERENCE.serialize(bb); + ExpressionType.COLUMN_REFERENCE.serialize(bb); tableSource.serialize(bb); bb.putInt(columnIndex); } diff --git a/java/src/main/java/ai/rapids/cudf/ast/CompiledExpression.java b/java/src/main/java/ai/rapids/cudf/ast/CompiledExpression.java index 0949b09cbb0..ea5dc003844 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/CompiledExpression.java +++ b/java/src/main/java/ai/rapids/cudf/ast/CompiledExpression.java @@ -18,12 +18,17 @@ import ai.rapids.cudf.ColumnVector; import ai.rapids.cudf.MemoryCleaner; +import ai.rapids.cudf.NativeDepsLoader; import ai.rapids.cudf.Table; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** This class wraps a native compiled AST and must be closed to avoid native memory leaks. */ public class CompiledExpression implements AutoCloseable { + static { + NativeDepsLoader.loadNativeDeps(); + } + private static final Logger log = LoggerFactory.getLogger(CompiledExpression.class); private static class CompiledExpressionCleaner extends MemoryCleaner.Cleaner { diff --git a/java/src/main/java/ai/rapids/cudf/ast/Literal.java b/java/src/main/java/ai/rapids/cudf/ast/Literal.java index be306cd99c4..b93efce8c94 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/Literal.java +++ b/java/src/main/java/ai/rapids/cudf/ast/Literal.java @@ -22,7 +22,7 @@ import java.nio.ByteOrder; /** A literal value in an AST expression. */ -public final class Literal extends AstNode { +public final class Literal extends AstExpression { private final DType type; private final byte[] serializedValue; @@ -207,8 +207,8 @@ public static Literal ofDurationFromLong(DType type, Long value) { @Override int getSerializedSize() { - NodeType nodeType = serializedValue != null - ? NodeType.VALID_LITERAL : NodeType.NULL_LITERAL; + ExpressionType nodeType = serializedValue != null + ? ExpressionType.VALID_LITERAL : ExpressionType.NULL_LITERAL; int size = nodeType.getSerializedSize() + getDataTypeSerializedSize(); if (serializedValue != null) { size += serializedValue.length; @@ -218,8 +218,8 @@ int getSerializedSize() { @Override void serialize(ByteBuffer bb) { - NodeType nodeType = serializedValue != null - ? NodeType.VALID_LITERAL : NodeType.NULL_LITERAL; + ExpressionType nodeType = serializedValue != null + ? 
ExpressionType.VALID_LITERAL : ExpressionType.NULL_LITERAL; nodeType.serialize(bb); serializeDataType(bb); if (serializedValue != null) { diff --git a/java/src/main/java/ai/rapids/cudf/ast/UnaryExpression.java b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperation.java similarity index 73% rename from java/src/main/java/ai/rapids/cudf/ast/UnaryExpression.java rename to java/src/main/java/ai/rapids/cudf/ast/UnaryOperation.java index fa8e70266ac..03c4c45afd4 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/UnaryExpression.java +++ b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperation.java @@ -18,26 +18,26 @@ import java.nio.ByteBuffer; -/** A unary expression consisting of an operator and an operand. */ -public final class UnaryExpression extends Expression { +/** A unary operation consisting of an operator and an operand. */ +public final class UnaryOperation extends AstExpression { private final UnaryOperator op; - private final AstNode input; + private final AstExpression input; - public UnaryExpression(UnaryOperator op, AstNode input) { + public UnaryOperation(UnaryOperator op, AstExpression input) { this.op = op; this.input = input; } @Override int getSerializedSize() { - return NodeType.UNARY_EXPRESSION.getSerializedSize() + + return ExpressionType.UNARY_EXPRESSION.getSerializedSize() + op.getSerializedSize() + input.getSerializedSize(); } @Override void serialize(ByteBuffer bb) { - NodeType.UNARY_EXPRESSION.serialize(bb); + ExpressionType.UNARY_EXPRESSION.serialize(bb); op.serialize(bb); input.serialize(bb); } diff --git a/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java index c3f193d06b4..9ef18dbd75d 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java +++ b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java @@ -19,7 +19,7 @@ import java.nio.ByteBuffer; /** - * Enumeration of AST operations that can appear in a unary expression. + * Enumeration of AST operators that can appear in a unary operation. * NOTE: This must be kept in sync with `jni_to_unary_operator` in CompiledExpression.cpp! */ public enum UnaryOperator { diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index a938a2af456..bc59e3aee64 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -186,7 +186,8 @@ endif(CUDF_JNI_ARROW_STATIC) find_library(ARROW_LIBRARY ${CUDF_JNI_ARROW_LIBNAME} REQUIRED HINTS "$ENV{ARROW_ROOT}/lib" - "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/release") + "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/release" + "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/debug") if(NOT ARROW_LIBRARY) if(CUDF_JNI_ARROW_STATIC) @@ -263,6 +264,7 @@ set(SOURCE_FILES "src/ColumnViewJni.cpp" "src/CompiledExpression.cpp" "src/ContiguousTableJni.cpp" + "src/HashJoinJni.cpp" "src/HostMemoryBufferNativeUtilsJni.cpp" "src/NvcompJni.cpp" "src/NvtxRangeJni.cpp" diff --git a/java/src/main/native/src/CompiledExpression.cpp b/java/src/main/native/src/CompiledExpression.cpp index 31f3184f107..470464f35c8 100644 --- a/java/src/main/native/src/CompiledExpression.cpp +++ b/java/src/main/native/src/CompiledExpression.cpp @@ -18,11 +18,10 @@ #include #include -#include -#include -#include +#include #include #include +#include #include #include "cudf_jni_apis.hpp" @@ -104,15 +103,15 @@ class jni_serialized_ast { }; /** - * Enumeration of the AST node types that can appear in the serialized data. + * Enumeration of the AST expression types that can appear in the serialized data. 
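On the Java side, the renamed classes above compose into an expression tree, and `AstExpression.compile()` serializes that tree into exactly the byte stream this native decoder consumes. A minimal sketch, assuming `TableReference.LEFT`/`RIGHT` and `BinaryOperator.LESS` from the same `ast` package (neither appears in this hunk):

```java
// Build (left.col0 < right.col0), compile it, and use it as a join condition.
AstExpression expr = new BinaryOperation(BinaryOperator.LESS,
    new ColumnReference(0, TableReference.LEFT),
    new ColumnReference(0, TableReference.RIGHT));
try (CompiledExpression condition = expr.compile()) {
  // pass `condition` to the conditional join APIs on Table
}
```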
* NOTE: This must be kept in sync with the NodeType enumeration in AstNode.java! */ -enum class jni_serialized_node_type : int8_t { +enum class jni_serialized_expression_type : int8_t { VALID_LITERAL = 0, NULL_LITERAL = 1, COLUMN_REFERENCE = 2, - UNARY_EXPRESSION = 3, - BINARY_EXPRESSION = 4 + UNARY_OPERATION = 3, + BINARY_OPERATION = 4 }; /** @@ -276,41 +275,42 @@ cudf::ast::column_reference &compile_column_reference(cudf::jni::ast::compiled_e } // forward declaration -cudf::ast::detail::node &compile_node(cudf::jni::ast::compiled_expr &compiled_expr, - jni_serialized_ast &jni_ast); +cudf::ast::expression &compile_expression(cudf::jni::ast::compiled_expr &compiled_expr, + jni_serialized_ast &jni_ast); /** Decode a serialized AST unary expression */ -cudf::ast::expression &compile_unary_expression(cudf::jni::ast::compiled_expr &compiled_expr, - jni_serialized_ast &jni_ast) { +cudf::ast::operation &compile_unary_expression(cudf::jni::ast::compiled_expr &compiled_expr, + jni_serialized_ast &jni_ast) { auto const ast_op = jni_to_unary_operator(jni_ast.read_byte()); - cudf::ast::detail::node &child_node = compile_node(compiled_expr, jni_ast); - return compiled_expr.add_expression(std::make_unique(ast_op, child_node)); + cudf::ast::expression &child_expression = compile_expression(compiled_expr, jni_ast); + return compiled_expr.add_operation( + std::make_unique(ast_op, child_expression)); } /** Decode a serialized AST binary expression */ -cudf::ast::expression &compile_binary_expression(cudf::jni::ast::compiled_expr &compiled_expr, - jni_serialized_ast &jni_ast) { +cudf::ast::operation &compile_binary_expression(cudf::jni::ast::compiled_expr &compiled_expr, + jni_serialized_ast &jni_ast) { auto const ast_op = jni_to_binary_operator(jni_ast.read_byte()); - cudf::ast::detail::node &left_child = compile_node(compiled_expr, jni_ast); - cudf::ast::detail::node &right_child = compile_node(compiled_expr, jni_ast); - return compiled_expr.add_expression( - std::make_unique(ast_op, left_child, right_child)); + cudf::ast::expression &left_child = compile_expression(compiled_expr, jni_ast); + cudf::ast::expression &right_child = compile_expression(compiled_expr, jni_ast); + return compiled_expr.add_operation( + std::make_unique(ast_op, left_child, right_child)); } -/** Decode a serialized AST node by reading the node type and dispatching */ -cudf::ast::detail::node &compile_node(cudf::jni::ast::compiled_expr &compiled_expr, - jni_serialized_ast &jni_ast) { - auto const node_type = static_cast(jni_ast.read_byte()); - switch (node_type) { - case jni_serialized_node_type::VALID_LITERAL: +/** Decode a serialized AST expression by reading the expression type and dispatching */ +cudf::ast::expression &compile_expression(cudf::jni::ast::compiled_expr &compiled_expr, + jni_serialized_ast &jni_ast) { + auto const expression_type = static_cast(jni_ast.read_byte()); + switch (expression_type) { + case jni_serialized_expression_type::VALID_LITERAL: return compile_literal(true, compiled_expr, jni_ast); - case jni_serialized_node_type::NULL_LITERAL: + case jni_serialized_expression_type::NULL_LITERAL: return compile_literal(false, compiled_expr, jni_ast); - case jni_serialized_node_type::COLUMN_REFERENCE: + case jni_serialized_expression_type::COLUMN_REFERENCE: return compile_column_reference(compiled_expr, jni_ast); - case jni_serialized_node_type::UNARY_EXPRESSION: + case jni_serialized_expression_type::UNARY_OPERATION: return compile_unary_expression(compiled_expr, jni_ast); - case 
jni_serialized_node_type::BINARY_EXPRESSION: + case jni_serialized_expression_type::BINARY_OPERATION: return compile_binary_expression(compiled_expr, jni_ast); default: throw std::invalid_argument("data is not a serialized AST expression"); } @@ -319,16 +319,7 @@ cudf::ast::detail::node &compile_node(cudf::jni::ast::compiled_expr &compiled_ex /** Decode a serialized AST into a native libcudf AST and associated resources */ std::unique_ptr compile_serialized_ast(jni_serialized_ast &jni_ast) { auto jni_expr_ptr = std::make_unique(); - auto const node_type = static_cast(jni_ast.read_byte()); - switch (node_type) { - case jni_serialized_node_type::UNARY_EXPRESSION: - (void)compile_unary_expression(*jni_expr_ptr, jni_ast); - break; - case jni_serialized_node_type::BINARY_EXPRESSION: - (void)compile_binary_expression(*jni_expr_ptr, jni_ast); - break; - default: throw std::invalid_argument("data is not a serialized AST expression"); - } + (void)compile_expression(*jni_expr_ptr, jni_ast); if (!jni_ast.at_eof()) { throw std::invalid_argument("Extra bytes at end of serialized AST"); @@ -366,7 +357,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ast_CompiledExpression_computeColumn auto compiled_expr_ptr = reinterpret_cast(j_ast); auto tview_ptr = reinterpret_cast(j_table); std::unique_ptr result = - cudf::ast::compute_column(*tview_ptr, compiled_expr_ptr->get_top_expression()); + cudf::compute_column(*tview_ptr, compiled_expr_ptr->get_top_expression()); return reinterpret_cast(result.release()); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/HashJoinJni.cpp b/java/src/main/native/src/HashJoinJni.cpp new file mode 100644 index 00000000000..0f78aef64bc --- /dev/null +++ b/java/src/main/native/src/HashJoinJni.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "cudf_jni_apis.hpp" + +extern "C" { + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_HashJoin_create(JNIEnv *env, jclass, jlong j_table, + jboolean j_nulls_equal) { + JNI_NULL_CHECK(env, j_table, "table handle is null", 0); + try { + cudf::jni::auto_set_device(env); + auto tview = reinterpret_cast(j_table); + auto nulleq = j_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto hash_join_ptr = new cudf::hash_join(*tview, nulleq); + return reinterpret_cast(hash_join_ptr); + } + CATCH_STD(env, 0); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_HashJoin_destroy(JNIEnv *env, jclass, jlong j_handle) { + try { + cudf::jni::auto_set_device(env); + auto hash_join_ptr = reinterpret_cast(j_handle); + delete hash_join_ptr; + } + CATCH_STD(env, ); +} + +} // extern "C" diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index c092450da1c..2bb56565f7a 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -668,21 +668,26 @@ namespace { int set_column_metadata(cudf::io::column_in_metadata &column_metadata, std::vector &col_names, cudf::jni::native_jbooleanArray &nullability, - cudf::jni::native_jbooleanArray &isInt96, + cudf::jni::native_jbooleanArray &is_int96, cudf::jni::native_jintArray &precisions, + cudf::jni::native_jbooleanArray &is_map, cudf::jni::native_jintArray &children, int num_children, int read_index) { int write_index = 0; for (int i = 0; i < num_children; i++, write_index++) { cudf::io::column_in_metadata child; child.set_name(col_names[read_index]) .set_decimal_precision(precisions[read_index]) - .set_int96_timestamps(isInt96[read_index]) + .set_int96_timestamps(is_int96[read_index]) .set_nullability(nullability[read_index]); + if (is_map[read_index]) { + child.set_list_column_as_map(); + } column_metadata.add_child(child); int childs_children = children[read_index++]; if (childs_children > 0) { - read_index = set_column_metadata(column_metadata.child(write_index), col_names, nullability, - isInt96, precisions, children, childs_children, read_index); + read_index = + set_column_metadata(column_metadata.child(write_index), col_names, nullability, is_int96, + precisions, is_map, children, childs_children, read_index); } } return read_index; @@ -692,7 +697,8 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam jintArray &j_children, jbooleanArray &j_col_nullability, jobjectArray &j_metadata_keys, jobjectArray &j_metadata_values, jint j_compression, jint j_stats_freq, jbooleanArray &j_isInt96, - jintArray &j_precisions, cudf::io::table_input_metadata &metadata) { + jintArray &j_precisions, jbooleanArray &j_is_map, + cudf::io::table_input_metadata &metadata) { cudf::jni::auto_set_device(env); cudf::jni::native_jstringArray col_names(env, j_col_names); cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); @@ -701,6 +707,7 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam cudf::jni::native_jstringArray meta_values(env, j_metadata_values); cudf::jni::native_jintArray precisions(env, j_precisions); cudf::jni::native_jintArray children(env, j_children); + cudf::jni::native_jbooleanArray is_map(env, j_is_map); auto cpp_names = col_names.as_cpp_vector(); @@ -714,11 +721,14 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam .set_nullability(col_nullability[read_index]) .set_int96_timestamps(isInt96[read_index]) .set_decimal_precision(precisions[read_index]); + if (is_map[read_index]) { + metadata.column_metadata[write_index].set_list_column_as_map(); + } int childs_children = children[read_index++]; if (childs_children > 0) { read_index = set_column_metadata(metadata.column_metadata[write_index], cpp_names, col_nullability, - isInt96, precisions, children, childs_children, read_index); + isInt96, precisions, 
is_map, children, childs_children, read_index); } } for (auto i = 0; i < meta_keys.size(); ++i) { @@ -745,13 +755,46 @@ bool valid_window_parameters(native_jintArray const &values, values.size() == preceding.size() && values.size() == following.size(); } -// Generate gather maps needed to manifest the result of an equi-join between two tables. +// Convert a cudf gather map pair into the form that Java expects // The resulting Java long array contains the following at each index: // 0: Size of each gather map in bytes // 1: Device address of the gather map for the left table // 2: Host address of the rmm::device_buffer instance that owns the left gather map data // 3: Device address of the gather map for the right table // 4: Host address of the rmm::device_buffer instance that owns the right gather map data +jlongArray gather_maps_to_java(JNIEnv *env, + std::pair>, + std::unique_ptr>> + maps) { + // release the underlying device buffer to Java + auto left_map_buffer = std::make_unique(maps.first->release()); + auto right_map_buffer = std::make_unique(maps.second->release()); + cudf::jni::native_jlongArray result(env, 5); + result[0] = static_cast(left_map_buffer->size()); + result[1] = reinterpret_cast(left_map_buffer->data()); + result[2] = reinterpret_cast(left_map_buffer.release()); + result[3] = reinterpret_cast(right_map_buffer->data()); + result[4] = reinterpret_cast(right_map_buffer.release()); + return result.get_jArray(); +} + +// Convert a cudf gather map into the form that Java expects +// The resulting Java long array contains the following at each index: +// 0: Size of the gather map in bytes +// 1: Device address of the gather map +// 2: Host address of the rmm::device_buffer instance that owns the gather map data +jlongArray gather_map_to_java(JNIEnv *env, + std::unique_ptr> map) { + // release the underlying device buffer to Java + auto gather_map_buffer = std::make_unique(map->release()); + cudf::jni::native_jlongArray result(env, 3); + result[0] = static_cast(gather_map_buffer->size()); + result[1] = reinterpret_cast(gather_map_buffer->data()); + result[2] = reinterpret_cast(gather_map_buffer.release()); + return result.get_jArray(); +} + +// Generate gather maps needed to manifest the result of an equi-join between two tables. template jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal, T join_func) { @@ -762,31 +805,29 @@ jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys, auto left_keys = reinterpret_cast(j_left_keys); auto right_keys = reinterpret_cast(j_right_keys); auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - std::pair>, - std::unique_ptr>> - join_maps = join_func(*left_keys, *right_keys, nulleq); - - // release the underlying device buffer to Java - auto left_map_buffer = std::make_unique(join_maps.first->release()); - auto right_map_buffer = std::make_unique(join_maps.second->release()); - cudf::jni::native_jlongArray result(env, 5); - result[0] = static_cast(left_map_buffer->size()); - result[1] = reinterpret_cast(left_map_buffer->data()); - result[2] = reinterpret_cast(left_map_buffer.release()); - result[3] = reinterpret_cast(right_map_buffer->data()); - result[4] = reinterpret_cast(right_map_buffer.release()); - return result.get_jArray(); + return gather_maps_to_java(env, join_func(*left_keys, *right_keys, nulleq)); + } + CATCH_STD(env, NULL); +} + +// Generate gather maps needed to manifest the result of an equi-join between a left table and +// a hash table built from the join's right table. +template +jlongArray hash_join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_hash_join, + jboolean compare_nulls_equal, T join_func) { + JNI_NULL_CHECK(env, j_left_keys, "left table is null", NULL); + JNI_NULL_CHECK(env, j_right_hash_join, "hash join is null", NULL); + try { + cudf::jni::auto_set_device(env); + auto left_keys = reinterpret_cast(j_left_keys); + auto hash_join = reinterpret_cast(j_right_hash_join); + auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + return gather_maps_to_java(env, join_func(*left_keys, *hash_join, nulleq)); } CATCH_STD(env, NULL); } // Generate gather maps needed to manifest the result of a conditional join between two tables. -// The resulting Java long array contains the following at each index: -// 0: Size of each gather map in bytes -// 1: Device address of the gather map for the left table -// 2: Host address of the rmm::device_buffer instance that owns the left gather map data -// 3: Device address of the gather map for the right table -// 4: Host address of the rmm::device_buffer instance that owns the right gather map data template jlongArray cond_join_gather_maps(JNIEnv *env, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal, T join_func) { @@ -799,29 +840,13 @@ jlongArray cond_join_gather_maps(JNIEnv *env, jlong j_left_table, jlong j_right_ auto right_table = reinterpret_cast(j_right_table); auto condition = reinterpret_cast(j_condition); auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - std::pair>, - std::unique_ptr>> - join_maps = join_func(*left_table, *right_table, condition->get_top_expression(), nulleq); - - // release the underlying device buffer to Java - auto left_map_buffer = std::make_unique(join_maps.first->release()); - auto right_map_buffer = std::make_unique(join_maps.second->release()); - cudf::jni::native_jlongArray result(env, 5); - result[0] = static_cast(left_map_buffer->size()); - result[1] = reinterpret_cast(left_map_buffer->data()); - result[2] = reinterpret_cast(left_map_buffer.release()); - result[3] = reinterpret_cast(right_map_buffer->data()); - result[4] = reinterpret_cast(right_map_buffer.release()); - return result.get_jArray(); + return gather_maps_to_java( + env, join_func(*left_table, *right_table, condition->get_top_expression(), nulleq)); } CATCH_STD(env, NULL); } // Generate a gather map needed to manifest the result of a semi/anti join between two tables. 
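The new `hash_join_gather_maps` helper above backs a family of `Table` natives (`leftHashJoinGatherMaps`, `innerHashJoinGatherMaps`, and friends, added later in this diff) that join against a prebuilt `cudf::hash_join`. A hedged sketch of how a caller might drive them from Java, assuming a `HashJoin` wrapper over the right table's key columns; its exact public signatures are not shown in this hunk:

```java
// Hypothetical usage, not the verified public API: build the hash table once,
// size the join, then request gather maps with the known output row count.
try (HashJoin rightHash = new HashJoin(rightKeys, true)) {
  long rows = leftKeys.leftJoinRowCount(rightHash);
  GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash, rows);
  try {
    // maps[0] gathers the left table, maps[1] gathers the right table
  } finally {
    for (GatherMap m : maps) {
      m.close();
    }
  }
}
```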
-// The resulting Java long array contains the following at each index: -// 0: Size of the gather map in bytes -// 1: Device address of the gather map -// 2: Host address of the rmm::device_buffer instance that owns the gather map data template jlongArray join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal, T join_func) { @@ -832,26 +857,13 @@ jlongArray join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_right_ auto left_keys = reinterpret_cast(j_left_keys); auto right_keys = reinterpret_cast(j_right_keys); auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - std::unique_ptr> join_map = - join_func(*left_keys, *right_keys, nulleq); - - // release the underlying device buffer to Java - auto gather_map_buffer = std::make_unique(join_map->release()); - cudf::jni::native_jlongArray result(env, 3); - result[0] = static_cast(gather_map_buffer->size()); - result[1] = reinterpret_cast(gather_map_buffer->data()); - result[2] = reinterpret_cast(gather_map_buffer.release()); - return result.get_jArray(); + return gather_map_to_java(env, join_func(*left_keys, *right_keys, nulleq)); } CATCH_STD(env, NULL); } // Generate a gather map needed to manifest the result of a conditional semi/anti join // between two tables. -// The resulting Java long array contains the following at each index: -// 0: Size of the gather map in bytes -// 1: Device address of the gather map -// 2: Host address of the rmm::device_buffer instance that owns the gather map data template jlongArray cond_join_gather_single_map(JNIEnv *env, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal, @@ -865,16 +877,8 @@ jlongArray cond_join_gather_single_map(JNIEnv *env, jlong j_left_table, jlong j_ auto right_table = reinterpret_cast(j_right_table); auto condition = reinterpret_cast(j_condition); auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - std::unique_ptr> join_map = - join_func(*left_table, *right_table, condition->get_top_expression(), nulleq); - - // release the underlying device buffer to Java - auto gather_map_buffer = std::make_unique(join_map->release()); - cudf::jni::native_jlongArray result(env, 3); - result[0] = static_cast(gather_map_buffer->size()); - result[1] = reinterpret_cast(gather_map_buffer->data()); - result[2] = reinterpret_cast(gather_map_buffer.release()); - return result.get_jArray(); + return gather_map_to_java( + env, join_func(*left_table, *right_table, condition->get_top_expression(), nulleq)); } CATCH_STD(env, NULL); } @@ -925,6 +929,45 @@ jlongArray combine_join_results(JNIEnv *env, cudf::table &left_results, return combine_join_results(env, std::move(left_cols), std::move(right_cols)); } +cudf::column_view remove_validity_from_col(cudf::column_view column_view) { + if (!cudf::is_compound(column_view.type())) { + if (column_view.nullable() && column_view.null_count() == 0) { + // null_mask is allocated but no nulls present therefore we create a new column_view without + // the null_mask to avoid things blowing up in reading the parquet file + return cudf::column_view(column_view.type(), column_view.size(), column_view.head(), nullptr, + 0, column_view.offset()); + } else { + return cudf::column_view(column_view); + } + } else { + std::unique_ptr ret; + std::vector children; + children.reserve(column_view.num_children()); + for (auto it = column_view.child_begin(); it != column_view.child_end(); it++) { + children.push_back(remove_validity_from_col(*it)); + } + if (!column_view.nullable() || column_view.null_count() != 0) { + ret.reset(new cudf::column_view(column_view.type(), column_view.size(), nullptr, + column_view.null_mask(), column_view.null_count(), + column_view.offset(), children)); + } else { + ret.reset(new cudf::column_view(column_view.type(), column_view.size(), nullptr, nullptr, 0, + column_view.offset(), children)); + } + return *ret.release(); + } +} + +cudf::table_view remove_validity_if_needed(cudf::table_view *input_table_view) { + std::vector views; + views.reserve(input_table_view->num_columns()); + for (auto it = input_table_view->begin(); it != input_table_view->end(); it++) { + views.push_back(remove_validity_from_col(*it)); + } + + return cudf::table_view(views); +} + } // namespace } // namespace jni @@ -932,6 +975,25 @@ jlongArray combine_join_results(JNIEnv *env, cudf::table &left_results, extern "C" { +// This is a method purely added for testing remove_validity_if_needed method +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_removeNullMasksIfNeeded(JNIEnv *env, jclass, + jlong j_table_view) { + JNI_NULL_CHECK(env, j_table_view, "table view handle is null", 0); + try { + cudf::table_view *tview = reinterpret_cast(j_table_view); + cudf::table_view result = cudf::jni::remove_validity_if_needed(tview); + cudf::table m_tbl(result); + std::vector> cols = m_tbl.release(); + auto results = cudf::jni::native_jlongArray(env, cols.size()); + int i = 0; + for (auto it = cols.begin(); it != cols.end(); it++) { + results[i++] = reinterpret_cast(it->release()); + } + return results.get_jArray(); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_createCudfTableView(JNIEnv *env, jclass, jlongArray j_cudf_columns) { JNI_NULL_CHECK(env, j_cudf_columns, "columns are null", 0); @@ -1152,7 +1214,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass } 
JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( - JNIEnv *env, jclass, jobjectArray col_names, jobjectArray data_types, + JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, jlong buffer_length, jint header_row, jbyte delim, jbyte quote, jbyte comment, jobjectArray null_values, jobjectArray true_values, jobjectArray false_values) { @@ -1173,7 +1235,23 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( try { cudf::jni::auto_set_device(env); cudf::jni::native_jstringArray n_col_names(env, col_names); - cudf::jni::native_jstringArray n_data_types(env, data_types); + cudf::jni::native_jintArray n_types(env, j_types); + cudf::jni::native_jintArray n_scales(env, j_scales); + if (n_types.is_null() != n_scales.is_null()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null", + NULL); + } + std::vector data_types; + if (!n_types.is_null()) { + if (n_types.size() != n_scales.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", + NULL); + } + data_types.reserve(n_types.size()); + for (int index = 0; index < n_types.size(); index++) { + data_types.emplace_back(cudf::jni::make_data_type(n_types[index], n_scales[index])); + } + } cudf::jni::native_jstring filename(env, inputfilepath); if (!read_buffer && filename.is_empty()) { @@ -1197,7 +1275,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( .delimiter(delim) .header(header_row) .names(n_col_names.as_cpp_vector()) - .dtypes(n_data_types.as_cpp_vector()) + .dtypes(data_types) .use_cols_names(n_filter_col_names.as_cpp_vector()) .true_values(n_true_values.as_cpp_vector()) .false_values(n_false_values.as_cpp_vector()) @@ -1207,6 +1285,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( .quotechar(quote) .comment(comment) .build(); + cudf::io::table_with_metadata result = cudf::io::read_csv(opts); return cudf::jni::convert_table_for_return(env, result.tbl); } @@ -1262,7 +1341,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, - jobject consumer) { + jbooleanArray j_is_map, jobject consumer) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1278,7 +1357,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_metadata_keys, j_metadata_values, j_compression, j_stats_freq, j_isInt96, - j_precisions, metadata); + j_precisions, j_is_map, metadata); chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) @@ -1298,7 +1377,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, - jstring j_output_path) { + jbooleanArray 
j_is_map, jstring j_output_path) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1312,7 +1391,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_metadata_keys, j_metadata_values, j_compression, j_stats_freq, j_isInt96, - j_precisions, metadata); + j_precisions, j_is_map, metadata); sink_info sink{output_path.get()}; chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) @@ -1336,7 +1415,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env, JNI_NULL_CHECK(env, j_state, "null state", ); using namespace cudf::io; - cudf::table_view *tview = reinterpret_cast(j_table); + cudf::table_view *tview_with_empty_nullmask = reinterpret_cast(j_table); + cudf::table_view tview = cudf::jni::remove_validity_if_needed(tview_with_empty_nullmask); cudf::jni::native_parquet_writer_handle *state = reinterpret_cast(j_state); @@ -1346,7 +1426,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env, } try { cudf::jni::auto_set_device(env); - state->writer->write(*tview); + state->writer->write(tview); } CATCH_STD(env, ) } @@ -1924,6 +2004,64 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoinGatherMaps( }); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_leftJoinRowCount(JNIEnv *env, jclass, + jlong j_left_table, + jlong j_right_hash_join, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); + JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto hash_join = reinterpret_cast(j_right_hash_join); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = hash_join->left_join_size(*left_table, nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMaps( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, + jboolean compare_nulls_equal) { + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, compare_nulls_equal, + [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { + return hash.left_join(left, nulleq); + }); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMapsWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, + jlong j_output_row_count) { + auto output_row_count = static_cast(j_output_row_count); + return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, + [output_row_count](cudf::table_view const &left, + cudf::hash_join const &hash, + cudf::null_equality nulleq) { + return hash.left_join(left, nulleq, output_row_count); + }); +} + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinRowCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left_table is null", 0); + JNI_NULL_CHECK(env, j_right_table, "right_table is null", 0); + JNI_NULL_CHECK(env, j_condition, "condition is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto right_table = reinterpret_cast(j_right_table); + auto condition = reinterpret_cast(j_condition); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = cudf::conditional_left_join_size(*left_table, *right_table, + condition->get_top_expression(), nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinGatherMaps( JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal) { @@ -1935,6 +2073,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinGather }); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinGatherMapsWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal, jlong j_row_count) { + auto row_count = static_cast(j_row_count); + return cudf::jni::cond_join_gather_maps( + env, j_left_table, j_right_table, j_condition, compare_nulls_equal, + [row_count](cudf::table_view const &left, cudf::table_view const &right, + cudf::ast::expression const &cond_expr, cudf::null_equality nulleq) { + return cudf::conditional_left_join(left, right, cond_expr, nulleq, row_count); + }); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoinGatherMaps( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) { return cudf::jni::join_gather_maps( @@ -1944,6 +2094,64 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoinGatherMaps( }); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_innerJoinRowCount(JNIEnv *env, jclass, + jlong j_left_table, + jlong j_right_hash_join, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); + JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto hash_join = reinterpret_cast(j_right_hash_join); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = hash_join->inner_join_size(*left_table, nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMaps( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, + jboolean compare_nulls_equal) { + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, compare_nulls_equal, + [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { + return hash.inner_join(left, nulleq); + }); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMapsWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, + jlong j_output_row_count) { + auto output_row_count = static_cast(j_output_row_count); + return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, + [output_row_count](cudf::table_view const &left, + cudf::hash_join const &hash, + cudf::null_equality nulleq) { + return hash.inner_join(left, nulleq, output_row_count); + }); +} + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinRowCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left_table is null", 0); + JNI_NULL_CHECK(env, j_right_table, "right_table is null", 0); + JNI_NULL_CHECK(env, j_condition, "condition is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto right_table = reinterpret_cast(j_right_table); + auto condition = reinterpret_cast(j_condition); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = cudf::conditional_inner_join_size(*left_table, *right_table, + condition->get_top_expression(), nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinGatherMaps( JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal) { @@ -1955,6 +2163,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinGathe }); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinGatherMapsWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal, jlong j_row_count) { + auto row_count = static_cast(j_row_count); + return cudf::jni::cond_join_gather_maps( + env, j_left_table, j_right_table, j_condition, compare_nulls_equal, + [row_count](cudf::table_view const &left, cudf::table_view const &right, + cudf::ast::expression const &cond_expr, cudf::null_equality nulleq) { + return cudf::conditional_inner_join(left, right, cond_expr, nulleq, row_count); + }); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoinGatherMaps( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) { return cudf::jni::join_gather_maps( @@ -1964,6 +2184,45 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoinGatherMaps( }); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_fullJoinRowCount(JNIEnv *env, jclass, + jlong j_left_table, + jlong j_right_hash_join, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); + JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto hash_join = reinterpret_cast(j_right_hash_join); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = hash_join->full_join_size(*left_table, nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMaps( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, + jboolean compare_nulls_equal) { + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, compare_nulls_equal, + [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { + return hash.full_join(left, nulleq); + }); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMapsWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, + jlong j_output_row_count) { + auto output_row_count = static_cast(j_output_row_count); + return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, + [output_row_count](cudf::table_view const &left, + cudf::hash_join const &hash, + cudf::null_equality nulleq) { + return hash.full_join(left, nulleq, output_row_count); + }); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalFullJoinGatherMaps( JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal) { @@ -1984,6 +2243,25 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftSemiJoinGatherMap( }); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinRowCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left_table is null", 0); + JNI_NULL_CHECK(env, j_right_table, "right_table is null", 0); + JNI_NULL_CHECK(env, j_condition, "condition is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto right_table = reinterpret_cast(j_right_table); + auto condition = reinterpret_cast(j_condition); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = cudf::conditional_left_semi_join_size(*left_table, *right_table, + condition->get_top_expression(), nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal) { @@ -1995,6 +2273,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGa }); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGatherMapWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal, jlong j_row_count) { + auto row_count = static_cast(j_row_count); + return cudf::jni::cond_join_gather_single_map( + env, j_left_table, j_right_table, j_condition, compare_nulls_equal, + [row_count](cudf::table_view const &left, cudf::table_view const &right, + cudf::ast::expression const &cond_expr, cudf::null_equality nulleq) { + return cudf::conditional_left_semi_join(left, right, cond_expr, nulleq, row_count); + }); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) { return cudf::jni::join_gather_single_map( @@ -2004,6 +2294,25 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoinGatherMap( }); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinRowCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal) { + JNI_NULL_CHECK(env, j_left_table, "left_table is null", 0); + JNI_NULL_CHECK(env, j_right_table, "right_table is null", 0); + JNI_NULL_CHECK(env, j_condition, "condition is null", 0); + try { + cudf::jni::auto_set_device(env); + auto left_table = reinterpret_cast(j_left_table); + auto right_table = reinterpret_cast(j_right_table); + auto condition = reinterpret_cast(j_condition); + auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; + auto row_count = cudf::conditional_left_anti_join_size(*left_table, *right_table, + condition->get_top_expression(), nulleq); + return static_cast(row_count); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, jboolean compare_nulls_equal) { @@ -2015,6 +2324,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGa }); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGatherMapWithCount( + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition, + jboolean compare_nulls_equal, jlong j_row_count) { + auto row_count = static_cast(j_row_count); + return cudf::jni::cond_join_gather_single_map( + env, j_left_table, j_right_table, j_condition, compare_nulls_equal, + [row_count](cudf::table_view const &left, cudf::table_view const &right, + cudf::ast::expression const &cond_expr, cudf::null_equality nulleq) { + return cudf::conditional_left_anti_join(left, right, cond_expr, nulleq, row_count); + }); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jclass, jlong left_table, jlong right_table) { @@ -2194,11 +2515,19 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByAggregate( for (int i = 0; i < n_values.size(); i++) { cudf::groupby::aggregation_request req; int col_index = n_values[i]; + + cudf::groupby_aggregation *agg = + dynamic_cast(n_agg_instances[i]); + JNI_ARG_CHECK(env, agg != nullptr, "aggregation is not an instance of groupby_aggregation", + nullptr); + std::unique_ptr cloned( + dynamic_cast(agg->clone().release())); + if (col_index == previous_index) { - requests.back().aggregations.push_back(n_agg_instances[i]->clone()); + requests.back().aggregations.push_back(std::move(cloned)); } else { req.values = n_input_table->column(col_index); - req.aggregations.push_back(n_agg_instances[i]->clone()); + req.aggregations.push_back(std::move(cloned)); requests.push_back(std::move(req)); } previous_index = col_index; @@ -2250,17 +2579,25 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByScan( // Aggregates are passed in already grouped by column, so we just need to fill it in // as we go. 
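The scan path mirrors the aggregate path above: every native aggregation handle must now be a `groupby_scan_aggregation`, which the Java layer guarantees by accepting only `GroupByScanAggregationOnColumn`. A hedged sketch of the corresponding Java call, assuming a `GroupByScanAggregation` factory that produces those wrappers (not shown in this hunk):

```java
// Hypothetical call site: running (cumulative) sum of column 1 within each
// group keyed by column 0, using the scan-specific aggregation type.
try (Table scanned = input.groupBy(0)
         .scan(GroupByScanAggregation.sum().onColumn(1))) {
  // scanned holds the keys followed by the per-group running sums
}
```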
- std::vector requests; + std::vector requests; int previous_index = -1; for (int i = 0; i < n_values.size(); i++) { - cudf::groupby::aggregation_request req; + cudf::groupby::scan_request req; int col_index = n_values[i]; + + cudf::groupby_scan_aggregation *agg = + dynamic_cast(n_agg_instances[i]); + JNI_ARG_CHECK(env, agg != nullptr, + "aggregation is not an instance of groupby_scan_aggregation", nullptr); + std::unique_ptr cloned( + dynamic_cast(agg->clone().release())); + if (col_index == previous_index) { - requests.back().aggregations.push_back(n_agg_instances[i]->clone()); + requests.back().aggregations.push_back(std::move(cloned)); } else { req.values = n_input_table->column(col_index); - req.aggregations.push_back(n_agg_instances[i]->clone()); + req.aggregations.push_back(std::move(cloned)); requests.push_back(std::move(req)); } previous_index = col_index; diff --git a/java/src/main/native/src/jni_compiled_expr.hpp b/java/src/main/native/src/jni_compiled_expr.hpp index e42e5a37fba..74010f71011 100644 --- a/java/src/main/native/src/jni_compiled_expr.hpp +++ b/java/src/main/native/src/jni_compiled_expr.hpp @@ -32,12 +32,6 @@ namespace ast { * base AST node type. Then we do not have to track every AST node type separately. */ class compiled_expr { - /** All literal nodes within the expression tree */ - std::vector> literals; - - /** All column reference nodes within the expression tree */ - std::vector> column_refs; - /** All expression nodes within the expression tree */ std::vector> expressions; @@ -47,20 +41,20 @@ class compiled_expr { public: cudf::ast::literal &add_literal(std::unique_ptr literal_ptr, std::unique_ptr scalar_ptr) { - literals.push_back(std::move(literal_ptr)); + expressions.push_back(std::move(literal_ptr)); scalars.push_back(std::move(scalar_ptr)); - return *literals.back(); + return static_cast(*expressions.back()); } cudf::ast::column_reference & add_column_ref(std::unique_ptr ref_ptr) { - column_refs.push_back(std::move(ref_ptr)); - return *column_refs.back(); + expressions.push_back(std::move(ref_ptr)); + return static_cast(*expressions.back()); } - cudf::ast::expression &add_expression(std::unique_ptr expr_ptr) { + cudf::ast::operation &add_operation(std::unique_ptr expr_ptr) { expressions.push_back(std::move(expr_ptr)); - return *expressions.back(); + return static_cast(*expressions.back()); } /** Return the expression node at the top of the tree */ diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index d3fdb0e19bb..4856071e296 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2899,24 +2899,22 @@ void testPrefixSum() { @Test void testScanSum() { try (ColumnVector v1 = ColumnVector.fromBoxedInts(1, 2, null, 3, 5, 8, 10)) { - // Due to https://github.com/rapidsai/cudf/issues/8462 NullPolicy.INCLUDE - // tests have been disabled -// try (ColumnVector result = v1.scan(Aggregation.sum(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(1, 3, null, null, null, null, null)) { -// assertColumnsAreEqual(expected, result); -// } - - try (ColumnVector result = v1.scan(Aggregation.sum(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.sum(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(1, 3, null, null, null, null, null)) { + assertColumnsAreEqual(expected, 
result); + } + + try (ColumnVector result = v1.scan(ScanAggregation.sum(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(1, 3, null, 6, 11, 19, 29)) { assertColumnsAreEqual(expected, result); } -// try (ColumnVector result = v1.scan(Aggregation.sum(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(0, 1, 3, 3, 6, 11, 19)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.sum(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(0, 1, 3, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.sum(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.sum(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(0, 1, null, 3, 6, 11, 19)) { assertColumnsAreEqual(expected, result); } @@ -2925,25 +2923,23 @@ void testScanSum() { @Test void testScanMax() { - // Due to https://github.com/rapidsai/cudf/issues/8462 NullPolicy.INCLUDE - // tests have been disabled try (ColumnVector v1 = ColumnVector.fromBoxedInts(1, 2, null, 3, 5, 8, 10)) { -// try (ColumnVector result = v1.scan(Aggregation.max(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, null, null, null, null)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.max(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.max(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.max(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 3, 5, 8, 10)) { assertColumnsAreEqual(expected, result); } -// try (ColumnVector result = v1.scan(Aggregation.max(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(Integer.MIN_VALUE, 1, 2, 2, 3, 5, 8)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.max(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(Integer.MIN_VALUE, 1, 2, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.max(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.max(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(Integer.MIN_VALUE, 1, null, 2, 3, 5, 8)) { assertColumnsAreEqual(expected, result); } @@ -2952,25 +2948,23 @@ void testScanMax() { @Test void testScanMin() { - // Due to https://github.com/rapidsai/cudf/issues/8462 NullPolicy.INCLUDE - // tests have been disabled try (ColumnVector v1 = ColumnVector.fromBoxedInts(1, 2, null, 3, 5, 8, 10)) { -// try (ColumnVector result = v1.scan(Aggregation.min(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(1, 1, null, null, null, null, null)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.min(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); + ColumnVector 
expected = ColumnVector.fromBoxedInts(1, 1, null, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.min(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.min(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(1, 1, null, 1, 1, 1, 1)) { assertColumnsAreEqual(expected, result); } -// try (ColumnVector result = v1.scan(Aggregation.min(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(Integer.MAX_VALUE, 1, 1, 1, 1, 1, 1)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.min(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(Integer.MAX_VALUE, 1, 1, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.min(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.min(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(Integer.MAX_VALUE, 1, null, 1, 1, 1, 1)) { assertColumnsAreEqual(expected, result); } @@ -2979,25 +2973,23 @@ void testScanMin() { @Test void testScanProduct() { - // Due to https://github.com/rapidsai/cudf/issues/8462 NullPolicy.INCLUDE - // tests have been disabled try (ColumnVector v1 = ColumnVector.fromBoxedInts(1, 2, null, 3, 5, 8, 10)) { -// try (ColumnVector result = v1.scan(Aggregation.product(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, null, null, null, null)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.product(), ScanType.INCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.product(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.product(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 6, 30, 240, 2400)) { assertColumnsAreEqual(expected, result); } -// try (ColumnVector result = v1.scan(Aggregation.product(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); -// ColumnVector expected = ColumnVector.fromBoxedInts(1, 1, 2, 2, 6, 30, 240)) { -// assertColumnsAreEqual(expected, result); -// } + try (ColumnVector result = v1.scan(ScanAggregation.product(), ScanType.EXCLUSIVE, NullPolicy.INCLUDE); + ColumnVector expected = ColumnVector.fromBoxedInts(1, 1, 2, null, null, null, null)) { + assertColumnsAreEqual(expected, result); + } - try (ColumnVector result = v1.scan(Aggregation.product(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); + try (ColumnVector result = v1.scan(ScanAggregation.product(), ScanType.EXCLUSIVE, NullPolicy.EXCLUDE); ColumnVector expected = ColumnVector.fromBoxedInts(1, 1, null, 2, 6, 30, 240)) { assertColumnsAreEqual(expected, result); } @@ -3011,13 +3003,13 @@ void testScanRank() { ColumnVector struct_order = ColumnVector.makeStruct(col1, col2); ColumnVector expected = ColumnVector.fromBoxedInts( 1, 1, 3, 4, 5, 6, 7, 7, 9, 9, 11, 12)) { - try (ColumnVector result = struct_order.scan(Aggregation.rank(), + try (ColumnVector result = struct_order.scan(ScanAggregation.rank(), ScanType.INCLUSIVE, 
NullPolicy.INCLUDE)) { assertColumnsAreEqual(expected, result); } // Exclude should have identical results - try (ColumnVector result = struct_order.scan(Aggregation.rank(), + try (ColumnVector result = struct_order.scan(ScanAggregation.rank(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE) ) { assertColumnsAreEqual(expected, result); @@ -3034,13 +3026,13 @@ void testScanDenseRank() { ColumnVector struct_order = ColumnVector.makeStruct(col1, col2); ColumnVector expected = ColumnVector.fromBoxedInts( 1, 1, 2, 3, 4, 5, 6, 6, 7, 7, 8, 9)) { - try (ColumnVector result = struct_order.scan(Aggregation.denseRank(), + try (ColumnVector result = struct_order.scan(ScanAggregation.denseRank(), ScanType.INCLUSIVE, NullPolicy.INCLUDE)) { assertColumnsAreEqual(expected, result); } // Exclude should have identical results - try (ColumnVector result = struct_order.scan(Aggregation.denseRank(), + try (ColumnVector result = struct_order.scan(ScanAggregation.denseRank(), ScanType.INCLUSIVE, NullPolicy.EXCLUDE)) { assertColumnsAreEqual(expected, result); } @@ -3058,39 +3050,39 @@ void testWindowStatic() { .minPeriods(2).build()) { try (ColumnVector v1 = ColumnVector.fromInts(5, 4, 7, 6, 8)) { try (ColumnVector expected = ColumnVector.fromLongs(9, 16, 17, 21, 14); - ColumnVector result = v1.rollingWindow(Aggregation.sum(), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.sum(), options)) { assertColumnsAreEqual(expected, result); } try (ColumnVector expected = ColumnVector.fromInts(4, 4, 4, 6, 6); - ColumnVector result = v1.rollingWindow(Aggregation.min(), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.min(), options)) { assertColumnsAreEqual(expected, result); } try (ColumnVector expected = ColumnVector.fromInts(5, 7, 7, 8, 8); - ColumnVector result = v1.rollingWindow(Aggregation.max(), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.max(), options)) { assertColumnsAreEqual(expected, result); } // The rolling window produces the same result type as the input try (ColumnVector expected = ColumnVector.fromDoubles(4.5, 16.0 / 3, 17.0 / 3, 7, 7); - ColumnVector result = v1.rollingWindow(Aggregation.mean(), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.mean(), options)) { assertColumnsAreEqual(expected, result); } try (ColumnVector expected = ColumnVector.fromBoxedInts(4, 7, 6, 8, null); - ColumnVector result = v1.rollingWindow(Aggregation.lead(1), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.lead(1), options)) { assertColumnsAreEqual(expected, result); } try (ColumnVector expected = ColumnVector.fromBoxedInts(null, 5, 4, 7, 6); - ColumnVector result = v1.rollingWindow(Aggregation.lag(1), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.lag(1), options)) { assertColumnsAreEqual(expected, result); } try (ColumnVector defaultOutput = ColumnVector.fromInts(-1, -2, -3, -4, -5); ColumnVector expected = ColumnVector.fromBoxedInts(-1, 5, 4, 7, 6); - ColumnVector result = v1.rollingWindow(Aggregation.lag(1, defaultOutput), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.lag(1, defaultOutput), options)) { assertColumnsAreEqual(expected, result); } } @@ -3106,11 +3098,11 @@ void testWindowStaticCounts() { .minPeriods(2).build()) { try (ColumnVector v1 = ColumnVector.fromBoxedInts(5, 4, null, 6, 8)) { try (ColumnVector expected = ColumnVector.fromInts(2, 2, 2, 2, 2); - ColumnVector result = v1.rollingWindow(Aggregation.count(NullPolicy.EXCLUDE), options)) { + 
ColumnVector result = v1.rollingWindow(RollingAggregation.count(NullPolicy.EXCLUDE), options)) { assertColumnsAreEqual(expected, result); } try (ColumnVector expected = ColumnVector.fromInts(2, 3, 3, 3, 2); - ColumnVector result = v1.rollingWindow(Aggregation.count(NullPolicy.INCLUDE), options)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.count(NullPolicy.INCLUDE), options)) { assertColumnsAreEqual(expected, result); } } @@ -3125,7 +3117,7 @@ void testWindowDynamicNegative() { .minPeriods(2).window(precedingCol, followingCol).build()) { try (ColumnVector v1 = ColumnVector.fromInts(5, 4, 7, 6, 8); ColumnVector expected = ColumnVector.fromBoxedLongs(null, null, 9L, 16L, 25L); - ColumnVector result = v1.rollingWindow(Aggregation.sum(), window)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.sum(), window)) { assertColumnsAreEqual(expected, result); } } @@ -3141,7 +3133,7 @@ void testWindowLag() { .window(two, negOne).build()) { try (ColumnVector v1 = ColumnVector.fromInts(5, 4, 7, 6, 8); ColumnVector expected = ColumnVector.fromBoxedInts(null, 5, 4, 7, 6); - ColumnVector result = v1.rollingWindow(Aggregation.max(), window)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.max(), window)) { assertColumnsAreEqual(expected, result); } } @@ -3155,7 +3147,7 @@ void testWindowDynamic() { .window(precedingCol, followingCol).build()) { try (ColumnVector v1 = ColumnVector.fromInts(5, 4, 7, 6, 8); ColumnVector expected = ColumnVector.fromLongs(16, 22, 30, 14, 14); - ColumnVector result = v1.rollingWindow(Aggregation.sum(), window)) { + ColumnVector result = v1.rollingWindow(RollingAggregation.sum(), window)) { assertColumnsAreEqual(expected, result); } } @@ -3181,7 +3173,7 @@ void testWindowThrowsException() { .minPeriods(1) .orderByColumnIndex(0) .build()) { - arraywindowCol.rollingWindow(Aggregation.sum(), options); + arraywindowCol.rollingWindow(RollingAggregation.sum(), options); } }); } diff --git a/java/src/test/java/ai/rapids/cudf/HashJoinTest.java b/java/src/test/java/ai/rapids/cudf/HashJoinTest.java new file mode 100644 index 00000000000..be6125340ec --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/HashJoinTest.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.rapids.cudf; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class HashJoinTest { + @Test + void testGetNumberOfColumns() { + try (Table t = new Table.TestBuilder().column(1, 2).column(3, 4).column(5, 6).build(); + HashJoin hashJoin = new HashJoin(t, false)) { + assertEquals(3, hashJoin.getNumberOfColumns()); + } + } + + @Test + void testGetCompareNulls() { + try (Table t = new Table.TestBuilder().column(1, 2, 3, 4).column(5, 6, 7, 8).build()) { + try (HashJoin hashJoin = new HashJoin(t, false)) { + assertFalse(hashJoin.getCompareNulls()); + } + try (HashJoin hashJoin = new HashJoin(t, true)) { + assertTrue(hashJoin.getCompareNulls()); + } + } + } +} diff --git a/java/src/test/java/ai/rapids/cudf/ReductionTest.java b/java/src/test/java/ai/rapids/cudf/ReductionTest.java index 17b9ec3556f..2b26597c8f7 100644 --- a/java/src/test/java/ai/rapids/cudf/ReductionTest.java +++ b/java/src/test/java/ai/rapids/cudf/ReductionTest.java @@ -43,17 +43,17 @@ class ReductionTest extends CudfTestBase { Aggregation.Kind.ANY, Aggregation.Kind.ALL); - private static Scalar buildExpectedScalar(Aggregation op, DType baseType, Object expectedObject) { + private static Scalar buildExpectedScalar(ReductionAggregation op, DType baseType, Object expectedObject) { if (expectedObject == null) { return Scalar.fromNull(baseType); } - if (FLOAT_REDUCTIONS.contains(op.kind)) { + if (FLOAT_REDUCTIONS.contains(op.getWrapped().kind)) { if (baseType.equals(DType.FLOAT32)) { return Scalar.fromFloat((Float) expectedObject); } return Scalar.fromDouble((Double) expectedObject); } - if (BOOL_REDUCTIONS.contains(op.kind)) { + if (BOOL_REDUCTIONS.contains(op.getWrapped().kind)) { return Scalar.fromBool((Boolean) expectedObject); } switch (baseType.typeId) { @@ -88,165 +88,165 @@ private static Scalar buildExpectedScalar(Aggregation op, DType baseType, Object private static Stream createBooleanParams() { Boolean[] vals = new Boolean[]{true, true, null, false, true, false, null}; return Stream.of( - Arguments.of(Aggregation.sum(), new Boolean[0], null, 0.), - Arguments.of(Aggregation.sum(), new Boolean[]{null, null, null}, null, 0.), - Arguments.of(Aggregation.sum(), vals, true, 0.), - Arguments.of(Aggregation.min(), vals, false, 0.), - Arguments.of(Aggregation.max(), vals, true, 0.), - Arguments.of(Aggregation.product(), vals, false, 0.), - Arguments.of(Aggregation.sumOfSquares(), vals, true, 0.), - Arguments.of(Aggregation.mean(), vals, 0.6, DELTAD), - Arguments.of(Aggregation.standardDeviation(), vals, 0.5477225575051662, DELTAD), - Arguments.of(Aggregation.variance(), vals, 0.3, DELTAD), - Arguments.of(Aggregation.any(), vals, true, 0.), - Arguments.of(Aggregation.all(), vals, false, 0.) 
+ Arguments.of(ReductionAggregation.sum(), new Boolean[0], null, 0.), + Arguments.of(ReductionAggregation.sum(), new Boolean[]{null, null, null}, null, 0.), + Arguments.of(ReductionAggregation.sum(), vals, true, 0.), + Arguments.of(ReductionAggregation.min(), vals, false, 0.), + Arguments.of(ReductionAggregation.max(), vals, true, 0.), + Arguments.of(ReductionAggregation.product(), vals, false, 0.), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, true, 0.), + Arguments.of(ReductionAggregation.mean(), vals, 0.6, DELTAD), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 0.5477225575051662, DELTAD), + Arguments.of(ReductionAggregation.variance(), vals, 0.3, DELTAD), + Arguments.of(ReductionAggregation.any(), vals, true, 0.), + Arguments.of(ReductionAggregation.all(), vals, false, 0.) ); } private static Stream createByteParams() { Byte[] vals = new Byte[]{-1, 7, 123, null, 50, 60, 100}; return Stream.of( - Arguments.of(Aggregation.sum(), new Byte[0], null, 0.), - Arguments.of(Aggregation.sum(), new Byte[]{null, null, null}, null, 0.), - Arguments.of(Aggregation.sum(), vals, (byte) 83, 0.), - Arguments.of(Aggregation.min(), vals, (byte) -1, 0.), - Arguments.of(Aggregation.max(), vals, (byte) 123, 0.), - Arguments.of(Aggregation.product(), vals, (byte) 160, 0.), - Arguments.of(Aggregation.sumOfSquares(), vals, (byte) 47, 0.), - Arguments.of(Aggregation.mean(), vals, 56.5, DELTAD), - Arguments.of(Aggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), - Arguments.of(Aggregation.variance(), vals, 2425.1, DELTAD), - Arguments.of(Aggregation.any(), vals, true, 0.), - Arguments.of(Aggregation.all(), vals, true, 0.) + Arguments.of(ReductionAggregation.sum(), new Byte[0], null, 0.), + Arguments.of(ReductionAggregation.sum(), new Byte[]{null, null, null}, null, 0.), + Arguments.of(ReductionAggregation.sum(), vals, (byte) 83, 0.), + Arguments.of(ReductionAggregation.min(), vals, (byte) -1, 0.), + Arguments.of(ReductionAggregation.max(), vals, (byte) 123, 0.), + Arguments.of(ReductionAggregation.product(), vals, (byte) 160, 0.), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, (byte) 47, 0.), + Arguments.of(ReductionAggregation.mean(), vals, 56.5, DELTAD), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), + Arguments.of(ReductionAggregation.variance(), vals, 2425.1, DELTAD), + Arguments.of(ReductionAggregation.any(), vals, true, 0.), + Arguments.of(ReductionAggregation.all(), vals, true, 0.) ); } private static Stream createShortParams() { Short[] vals = new Short[]{-1, 7, 123, null, 50, 60, 100}; return Stream.of( - Arguments.of(Aggregation.sum(), new Short[0], null, 0.), - Arguments.of(Aggregation.sum(), new Short[]{null, null, null}, null, 0.), - Arguments.of(Aggregation.sum(), vals, (short) 339, 0.), - Arguments.of(Aggregation.min(), vals, (short) -1, 0.), - Arguments.of(Aggregation.max(), vals, (short) 123, 0.), - Arguments.of(Aggregation.product(), vals, (short) -22624, 0.), - Arguments.of(Aggregation.sumOfSquares(), vals, (short) 31279, 0.), - Arguments.of(Aggregation.mean(), vals, 56.5, DELTAD), - Arguments.of(Aggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), - Arguments.of(Aggregation.variance(), vals, 2425.1, DELTAD), - Arguments.of(Aggregation.any(), vals, true, 0.), - Arguments.of(Aggregation.all(), vals, true, 0.) 
+ Arguments.of(ReductionAggregation.sum(), new Short[0], null, 0.), + Arguments.of(ReductionAggregation.sum(), new Short[]{null, null, null}, null, 0.), + Arguments.of(ReductionAggregation.sum(), vals, (short) 339, 0.), + Arguments.of(ReductionAggregation.min(), vals, (short) -1, 0.), + Arguments.of(ReductionAggregation.max(), vals, (short) 123, 0.), + Arguments.of(ReductionAggregation.product(), vals, (short) -22624, 0.), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, (short) 31279, 0.), + Arguments.of(ReductionAggregation.mean(), vals, 56.5, DELTAD), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), + Arguments.of(ReductionAggregation.variance(), vals, 2425.1, DELTAD), + Arguments.of(ReductionAggregation.any(), vals, true, 0.), + Arguments.of(ReductionAggregation.all(), vals, true, 0.) ); } private static Stream createIntParams() { Integer[] vals = new Integer[]{-1, 7, 123, null, 50, 60, 100}; return Stream.of( - Arguments.of(Aggregation.sum(), new Integer[0], null, 0.), - Arguments.of(Aggregation.sum(), new Integer[]{null, null, null}, null, 0.), - Arguments.of(Aggregation.sum(), vals, 339, 0.), - Arguments.of(Aggregation.min(), vals, -1, 0.), - Arguments.of(Aggregation.max(), vals, 123, 0.), - Arguments.of(Aggregation.product(), vals, -258300000, 0.), - Arguments.of(Aggregation.sumOfSquares(), vals, 31279, 0.), - Arguments.of(Aggregation.mean(), vals, 56.5, DELTAD), - Arguments.of(Aggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), - Arguments.of(Aggregation.variance(), vals, 2425.1, DELTAD), - Arguments.of(Aggregation.any(), vals, true, 0.), - Arguments.of(Aggregation.all(), vals, true, 0.) + Arguments.of(ReductionAggregation.sum(), new Integer[0], null, 0.), + Arguments.of(ReductionAggregation.sum(), new Integer[]{null, null, null}, null, 0.), + Arguments.of(ReductionAggregation.sum(), vals, 339, 0.), + Arguments.of(ReductionAggregation.min(), vals, -1, 0.), + Arguments.of(ReductionAggregation.max(), vals, 123, 0.), + Arguments.of(ReductionAggregation.product(), vals, -258300000, 0.), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, 31279, 0.), + Arguments.of(ReductionAggregation.mean(), vals, 56.5, DELTAD), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), + Arguments.of(ReductionAggregation.variance(), vals, 2425.1, DELTAD), + Arguments.of(ReductionAggregation.any(), vals, true, 0.), + Arguments.of(ReductionAggregation.all(), vals, true, 0.) 
); } private static Stream createLongParams() { Long[] vals = new Long[]{-1L, 7L, 123L, null, 50L, 60L, 100L}; return Stream.of( - Arguments.of(Aggregation.sum(), new Long[0], null, 0.), - Arguments.of(Aggregation.sum(), new Long[]{null, null, null}, null, 0.), - Arguments.of(Aggregation.sum(), vals, 339L, 0.), - Arguments.of(Aggregation.min(), vals, -1L, 0.), - Arguments.of(Aggregation.max(), vals, 123L, 0.), - Arguments.of(Aggregation.product(), vals, -258300000L, 0.), - Arguments.of(Aggregation.sumOfSquares(), vals, 31279L, 0.), - Arguments.of(Aggregation.mean(), vals, 56.5, DELTAD), - Arguments.of(Aggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), - Arguments.of(Aggregation.variance(), vals, 2425.1, DELTAD), - Arguments.of(Aggregation.any(), vals, true, 0.), - Arguments.of(Aggregation.all(), vals, true, 0.), - Arguments.of(Aggregation.quantile(0.5), vals, 55.0, DELTAD), - Arguments.of(Aggregation.quantile(0.9), vals, 111.5, DELTAD) + Arguments.of(ReductionAggregation.sum(), new Long[0], null, 0.), + Arguments.of(ReductionAggregation.sum(), new Long[]{null, null, null}, null, 0.), + Arguments.of(ReductionAggregation.sum(), vals, 339L, 0.), + Arguments.of(ReductionAggregation.min(), vals, -1L, 0.), + Arguments.of(ReductionAggregation.max(), vals, 123L, 0.), + Arguments.of(ReductionAggregation.product(), vals, -258300000L, 0.), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, 31279L, 0.), + Arguments.of(ReductionAggregation.mean(), vals, 56.5, DELTAD), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), + Arguments.of(ReductionAggregation.variance(), vals, 2425.1, DELTAD), + Arguments.of(ReductionAggregation.any(), vals, true, 0.), + Arguments.of(ReductionAggregation.all(), vals, true, 0.), + Arguments.of(ReductionAggregation.quantile(0.5), vals, 55.0, DELTAD), + Arguments.of(ReductionAggregation.quantile(0.9), vals, 111.5, DELTAD) ); } private static Stream createFloatParams() { Float[] vals = new Float[]{-1f, 7f, 123f, null, 50f, 60f, 100f}; return Stream.of( - Arguments.of(Aggregation.sum(), new Float[0], null, 0f), - Arguments.of(Aggregation.sum(), new Float[]{null, null, null}, null, 0f), - Arguments.of(Aggregation.sum(), vals, 339f, 0f), - Arguments.of(Aggregation.min(), vals, -1f, 0f), - Arguments.of(Aggregation.max(), vals, 123f, 0f), - Arguments.of(Aggregation.product(), vals, -258300000f, 0f), - Arguments.of(Aggregation.sumOfSquares(), vals, 31279f, 0f), - Arguments.of(Aggregation.mean(), vals, 56.5f, DELTAF), - Arguments.of(Aggregation.standardDeviation(), vals, 49.24530434467839f, DELTAF), - Arguments.of(Aggregation.variance(), vals, 2425.1f, DELTAF), - Arguments.of(Aggregation.any(), vals, true, 0f), - Arguments.of(Aggregation.all(), vals, true, 0f) + Arguments.of(ReductionAggregation.sum(), new Float[0], null, 0f), + Arguments.of(ReductionAggregation.sum(), new Float[]{null, null, null}, null, 0f), + Arguments.of(ReductionAggregation.sum(), vals, 339f, 0f), + Arguments.of(ReductionAggregation.min(), vals, -1f, 0f), + Arguments.of(ReductionAggregation.max(), vals, 123f, 0f), + Arguments.of(ReductionAggregation.product(), vals, -258300000f, 0f), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, 31279f, 0f), + Arguments.of(ReductionAggregation.mean(), vals, 56.5f, DELTAF), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 49.24530434467839f, DELTAF), + Arguments.of(ReductionAggregation.variance(), vals, 2425.1f, DELTAF), + Arguments.of(ReductionAggregation.any(), vals, true, 0f), + 
Arguments.of(ReductionAggregation.all(), vals, true, 0f) ); } private static Stream createDoubleParams() { Double[] vals = new Double[]{-1., 7., 123., null, 50., 60., 100.}; return Stream.of( - Arguments.of(Aggregation.sum(), new Double[0], null, 0.), - Arguments.of(Aggregation.sum(), new Double[]{null, null, null}, null, 0.), - Arguments.of(Aggregation.sum(), vals, 339., 0.), - Arguments.of(Aggregation.min(), vals, -1., 0.), - Arguments.of(Aggregation.max(), vals, 123., 0.), - Arguments.of(Aggregation.product(), vals, -258300000., 0.), - Arguments.of(Aggregation.sumOfSquares(), vals, 31279., 0.), - Arguments.of(Aggregation.mean(), vals, 56.5, DELTAD), - Arguments.of(Aggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), - Arguments.of(Aggregation.variance(), vals, 2425.1, DELTAD), - Arguments.of(Aggregation.any(), vals, true, 0.), - Arguments.of(Aggregation.all(), vals, true, 0.), - Arguments.of(Aggregation.quantile(0.5), vals, 55.0, DELTAD), - Arguments.of(Aggregation.quantile(0.9), vals, 111.5, DELTAD) + Arguments.of(ReductionAggregation.sum(), new Double[0], null, 0.), + Arguments.of(ReductionAggregation.sum(), new Double[]{null, null, null}, null, 0.), + Arguments.of(ReductionAggregation.sum(), vals, 339., 0.), + Arguments.of(ReductionAggregation.min(), vals, -1., 0.), + Arguments.of(ReductionAggregation.max(), vals, 123., 0.), + Arguments.of(ReductionAggregation.product(), vals, -258300000., 0.), + Arguments.of(ReductionAggregation.sumOfSquares(), vals, 31279., 0.), + Arguments.of(ReductionAggregation.mean(), vals, 56.5, DELTAD), + Arguments.of(ReductionAggregation.standardDeviation(), vals, 49.24530434467839, DELTAD), + Arguments.of(ReductionAggregation.variance(), vals, 2425.1, DELTAD), + Arguments.of(ReductionAggregation.any(), vals, true, 0.), + Arguments.of(ReductionAggregation.all(), vals, true, 0.), + Arguments.of(ReductionAggregation.quantile(0.5), vals, 55.0, DELTAD), + Arguments.of(ReductionAggregation.quantile(0.9), vals, 111.5, DELTAD) ); } private static Stream createTimestampDaysParams() { Integer[] vals = new Integer[]{-1, 7, 123, null, 50, 60, 100}; return Stream.of( - Arguments.of(Aggregation.max(), new Integer[0], null), - Arguments.of(Aggregation.max(), new Integer[]{null, null, null}, null), - Arguments.of(Aggregation.max(), vals, 123), - Arguments.of(Aggregation.min(), vals, -1) + Arguments.of(ReductionAggregation.max(), new Integer[0], null), + Arguments.of(ReductionAggregation.max(), new Integer[]{null, null, null}, null), + Arguments.of(ReductionAggregation.max(), vals, 123), + Arguments.of(ReductionAggregation.min(), vals, -1) ); } private static Stream createTimestampResolutionParams() { Long[] vals = new Long[]{-1L, 7L, 123L, null, 50L, 60L, 100L}; return Stream.of( - Arguments.of(Aggregation.max(), new Long[0], null), - Arguments.of(Aggregation.max(), new Long[]{null, null, null}, null), - Arguments.of(Aggregation.min(), vals, -1L), - Arguments.of(Aggregation.max(), vals, 123L) + Arguments.of(ReductionAggregation.max(), new Long[0], null), + Arguments.of(ReductionAggregation.max(), new Long[]{null, null, null}, null), + Arguments.of(ReductionAggregation.min(), vals, -1L), + Arguments.of(ReductionAggregation.max(), vals, 123L) ); } - private static void assertEqualsDelta(Aggregation op, Scalar expected, Scalar result, + private static void assertEqualsDelta(ReductionAggregation op, Scalar expected, Scalar result, Double percentage) { - if (FLOAT_REDUCTIONS.contains(op.kind)) { + if (FLOAT_REDUCTIONS.contains(op.getWrapped().kind)) { 
assertEqualsWithinPercentage(expected.getDouble(), result.getDouble(), percentage); } else { assertEquals(expected, result); } } - private static void assertEqualsDelta(Aggregation op, Scalar expected, Scalar result, + private static void assertEqualsDelta(ReductionAggregation op, Scalar expected, Scalar result, Float percentage) { - if (FLOAT_REDUCTIONS.contains(op.kind)) { + if (FLOAT_REDUCTIONS.contains(op.getWrapped().kind)) { assertEqualsWithinPercentage(expected.getFloat(), result.getFloat(), percentage); } else { assertEquals(expected, result); @@ -255,7 +255,7 @@ private static void assertEqualsDelta(Aggregation op, Scalar expected, Scalar re @ParameterizedTest @MethodSource("createBooleanParams") - void testBoolean(Aggregation op, Boolean[] values, Object expectedObject, Double delta) { + void testBoolean(ReductionAggregation op, Boolean[] values, Object expectedObject, Double delta) { try (Scalar expected = buildExpectedScalar(op, DType.BOOL8, expectedObject); ColumnVector v = ColumnVector.fromBoxedBooleans(values); Scalar result = v.reduce(op, expected.getType())) { @@ -265,7 +265,7 @@ void testBoolean(Aggregation op, Boolean[] values, Object expectedObject, Double @ParameterizedTest @MethodSource("createByteParams") - void testByte(Aggregation op, Byte[] values, Object expectedObject, Double delta) { + void testByte(ReductionAggregation op, Byte[] values, Object expectedObject, Double delta) { try (Scalar expected = buildExpectedScalar(op, DType.INT8, expectedObject); ColumnVector v = ColumnVector.fromBoxedBytes(values); Scalar result = v.reduce(op, expected.getType())) { @@ -275,7 +275,7 @@ void testByte(Aggregation op, Byte[] values, Object expectedObject, Double delta @ParameterizedTest @MethodSource("createShortParams") - void testShort(Aggregation op, Short[] values, Object expectedObject, Double delta) { + void testShort(ReductionAggregation op, Short[] values, Object expectedObject, Double delta) { try (Scalar expected = buildExpectedScalar(op, DType.INT16, expectedObject); ColumnVector v = ColumnVector.fromBoxedShorts(values); Scalar result = v.reduce(op, expected.getType())) { @@ -285,7 +285,7 @@ void testShort(Aggregation op, Short[] values, Object expectedObject, Double del @ParameterizedTest @MethodSource("createIntParams") - void testInt(Aggregation op, Integer[] values, Object expectedObject, Double delta) { + void testInt(ReductionAggregation op, Integer[] values, Object expectedObject, Double delta) { try (Scalar expected = buildExpectedScalar(op, DType.INT32, expectedObject); ColumnVector v = ColumnVector.fromBoxedInts(values); Scalar result = v.reduce(op, expected.getType())) { @@ -295,7 +295,7 @@ void testInt(Aggregation op, Integer[] values, Object expectedObject, Double del @ParameterizedTest @MethodSource("createLongParams") - void testLong(Aggregation op, Long[] values, Object expectedObject, Double delta) { + void testLong(ReductionAggregation op, Long[] values, Object expectedObject, Double delta) { try (Scalar expected = buildExpectedScalar(op, DType.INT64, expectedObject); ColumnVector v = ColumnVector.fromBoxedLongs(values); Scalar result = v.reduce(op, expected.getType())) { @@ -305,7 +305,7 @@ void testLong(Aggregation op, Long[] values, Object expectedObject, Double delta @ParameterizedTest @MethodSource("createFloatParams") - void testFloat(Aggregation op, Float[] values, Object expectedObject, Float delta) { + void testFloat(ReductionAggregation op, Float[] values, Object expectedObject, Float delta) { try (Scalar expected = 
buildExpectedScalar(op, DType.FLOAT32, expectedObject); ColumnVector v = ColumnVector.fromBoxedFloats(values); Scalar result = v.reduce(op, expected.getType())) { @@ -315,7 +315,7 @@ void testFloat(Aggregation op, Float[] values, Object expectedObject, Float delt @ParameterizedTest @MethodSource("createDoubleParams") - void testDouble(Aggregation op, Double[] values, Object expectedObject, Double delta) { + void testDouble(ReductionAggregation op, Double[] values, Object expectedObject, Double delta) { try (Scalar expected = buildExpectedScalar(op, DType.FLOAT64, expectedObject); ColumnVector v = ColumnVector.fromBoxedDoubles(values); Scalar result = v.reduce(op, expected.getType())) { @@ -325,7 +325,7 @@ void testDouble(Aggregation op, Double[] values, Object expectedObject, Double d @ParameterizedTest @MethodSource("createTimestampDaysParams") - void testTimestampDays(Aggregation op, Integer[] values, Object expectedObject) { + void testTimestampDays(ReductionAggregation op, Integer[] values, Object expectedObject) { try (Scalar expected = buildExpectedScalar(op, DType.TIMESTAMP_DAYS, expectedObject); ColumnVector v = ColumnVector.timestampDaysFromBoxedInts(values); Scalar result = v.reduce(op, expected.getType())) { @@ -335,7 +335,7 @@ void testTimestampDays(Aggregation op, Integer[] values, Object expectedObject) @ParameterizedTest @MethodSource("createTimestampResolutionParams") - void testTimestampSeconds(Aggregation op, Long[] values, Object expectedObject) { + void testTimestampSeconds(ReductionAggregation op, Long[] values, Object expectedObject) { try (Scalar expected = buildExpectedScalar(op, DType.TIMESTAMP_SECONDS, expectedObject); ColumnVector v = ColumnVector.timestampSecondsFromBoxedLongs(values); Scalar result = v.reduce(op, expected.getType())) { @@ -345,7 +345,7 @@ void testTimestampSeconds(Aggregation op, Long[] values, Object expectedObject) @ParameterizedTest @MethodSource("createTimestampResolutionParams") - void testTimestampMilliseconds(Aggregation op, Long[] values, Object expectedObject) { + void testTimestampMilliseconds(ReductionAggregation op, Long[] values, Object expectedObject) { try (Scalar expected = buildExpectedScalar(op, DType.TIMESTAMP_MILLISECONDS, expectedObject); ColumnVector v = ColumnVector.timestampMilliSecondsFromBoxedLongs(values); Scalar result = v.reduce(op, expected.getType())) { @@ -355,7 +355,7 @@ void testTimestampMilliseconds(Aggregation op, Long[] values, Object expectedObj @ParameterizedTest @MethodSource("createTimestampResolutionParams") - void testTimestampMicroseconds(Aggregation op, Long[] values, Object expectedObject) { + void testTimestampMicroseconds(ReductionAggregation op, Long[] values, Object expectedObject) { try (Scalar expected = buildExpectedScalar(op, DType.TIMESTAMP_MICROSECONDS, expectedObject); ColumnVector v = ColumnVector.timestampMicroSecondsFromBoxedLongs(values); Scalar result = v.reduce(op, expected.getType())) { @@ -365,7 +365,7 @@ void testTimestampMicroseconds(Aggregation op, Long[] values, Object expectedObj @ParameterizedTest @MethodSource("createTimestampResolutionParams") - void testTimestampNanoseconds(Aggregation op, Long[] values, Object expectedObject) { + void testTimestampNanoseconds(ReductionAggregation op, Long[] values, Object expectedObject) { try (Scalar expected = buildExpectedScalar(op, DType.TIMESTAMP_NANOSECONDS, expectedObject); ColumnVector v = ColumnVector.timestampNanoSecondsFromBoxedLongs(values); Scalar result = v.reduce(op, expected.getType())) { diff --git 
a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 360f3c04f5b..cc030c392cb 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -25,11 +25,17 @@ import ai.rapids.cudf.HostColumnVector.StructData; import ai.rapids.cudf.HostColumnVector.StructType; -import ai.rapids.cudf.ast.BinaryExpression; +import ai.rapids.cudf.ast.BinaryOperation; import ai.rapids.cudf.ast.BinaryOperator; import ai.rapids.cudf.ast.ColumnReference; import ai.rapids.cudf.ast.CompiledExpression; import ai.rapids.cudf.ast.TableReference; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; import org.junit.jupiter.api.Test; import java.io.ByteArrayInputStream; @@ -43,18 +49,14 @@ import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.nio.file.Files; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; +import static ai.rapids.cudf.ParquetColumnWriterOptions.mapColumn; import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; import static ai.rapids.cudf.ParquetWriterOptions.structBuilder; import static ai.rapids.cudf.Table.TestBuilder; +import static ai.rapids.cudf.Table.removeNullMasksIfNeeded; import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -104,7 +106,7 @@ public static void assertColumnsAreEqual(ColumnView expect, ColumnView cv) { * @param colName The name of the column */ public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, String colName) { - assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true); + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); } /** @@ -114,7 +116,7 @@ public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, Str * @param colName The name of the host column */ public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVector cv, String colName) { - assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true); + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); } /** @@ -123,7 +125,7 @@ public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVe * @param cv The input Struct column */ public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView cv) { - assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true); + assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true, false); } /** @@ -133,13 +135,14 @@ public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView c * @param length The number of rows to consider * @param cv The input Struct column * @param colName The name of the column - * @param enableNullCheck Whether to check for nulls in the Struct column + * @param enableNullCountCheck Whether to check for nulls in the Struct column + * @param 
enableNullabilityCheck Whether the table have a validity mask */ public static void assertPartialStructColumnsAreEqual(ColumnView expected, long rowOffset, long length, - ColumnView cv, String colName, boolean enableNullCheck) { + ColumnView cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { try (HostColumnVector hostExpected = expected.copyToHost(); HostColumnVector hostcv = cv.copyToHost()) { - assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck); + assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCountCheck, enableNullabilityCheck); } } @@ -149,12 +152,13 @@ public static void assertPartialStructColumnsAreEqual(ColumnView expected, long * @param cv The input column * @param colName The name of the column * @param enableNullCheck Whether to check for nulls in the column + * @param enableNullabilityCheck Whether the table have a validity mask */ public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOffset, long length, - ColumnView cv, String colName, boolean enableNullCheck) { + ColumnView cv, String colName, boolean enableNullCheck, boolean enableNullabilityCheck) { try (HostColumnVector hostExpected = expected.copyToHost(); HostColumnVector hostcv = cv.copyToHost()) { - assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck); + assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck, enableNullabilityCheck); } } @@ -165,18 +169,21 @@ public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOff * @param length number of rows from starting offset * @param cv The input host column * @param colName The name of the host column - * @param enableNullCheck Whether to check for nulls in the host column + * @param enableNullCountCheck Whether to check for nulls in the host column */ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, long rowOffset, long length, - HostColumnVectorCore cv, String colName, boolean enableNullCheck) { + HostColumnVectorCore cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { assertEquals(expected.getType(), cv.getType(), "Type For Column " + colName); assertEquals(length, cv.getRowCount(), "Row Count For Column " + colName); assertEquals(expected.getNumChildren(), cv.getNumChildren(), "Child Count for Column " + colName); - if (enableNullCheck) { + if (enableNullCountCheck) { assertEquals(expected.getNullCount(), cv.getNullCount(), "Null Count For Column " + colName); } else { // TODO add in a proper check when null counts are supported by serializing a partitioned column } + if (enableNullabilityCheck) { + assertEquals(expected.hasValidityVector(), cv.hasValidityVector(), "Column nullability is different than expected"); + } DType type = expected.getType(); for (long expectedRow = rowOffset; expectedRow < (rowOffset + length); expectedRow++) { long tableRow = expectedRow - rowOffset; @@ -262,7 +269,7 @@ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, l } assertPartialColumnsAreEqual(expected.getNestedChildren().get(0), expectedChildRowOffset, numChildRows, cv.getNestedChildren().get(0), colName + " list child", - enableNullCheck); + enableNullCountCheck, enableNullabilityCheck); break; case STRUCT: List expectedChildren = expected.getNestedChildren(); @@ -273,7 +280,7 @@ public static void assertPartialColumnsAreEqual(HostColumnVectorCore 
expected, l String childName = colName + " child " + i; assertEquals(length, cvChild.getRowCount(), "Row Count for Column " + colName); assertPartialColumnsAreEqual(expectedChild, rowOffset, length, cvChild, - colName, enableNullCheck); + colName, enableNullCountCheck, enableNullabilityCheck); } break; default: @@ -289,9 +296,10 @@ public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, l * @param length the number of rows to check * @param table the input table to compare against expected * @param enableNullCheck whether to check for nulls or not + * @param enableNullabilityCheck whether the table have a validity mask */ public static void assertPartialTablesAreEqual(Table expected, long rowOffset, long length, Table table, - boolean enableNullCheck) { + boolean enableNullCheck, boolean enableNullabilityCheck) { assertEquals(expected.getNumberOfColumns(), table.getNumberOfColumns()); assertEquals(length, table.getRowCount(), "ROW COUNT"); for (int col = 0; col < expected.getNumberOfColumns(); col++) { @@ -301,7 +309,7 @@ public static void assertPartialTablesAreEqual(Table expected, long rowOffset, l if (rowOffset != 0 || length != expected.getRowCount()) { name = name + " PART " + rowOffset + "-" + (rowOffset + length - 1); } - assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck); + assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck, enableNullabilityCheck); } } @@ -311,7 +319,7 @@ public static void assertPartialTablesAreEqual(Table expected, long rowOffset, l * @param table the input table to compare against expected */ public static void assertTablesAreEqual(Table expected, Table table) { - assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true); + assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true, false); } void assertTablesHaveSameValues(HashMap[] expectedTable, Table table) { @@ -1489,20 +1497,118 @@ void testLeftJoinGatherMapsNulls() { } } + @Test + void testLeftHashJoinGatherMaps() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) + .build()) { + GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testLeftHashJoinGatherMapsWithCount() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) + .build()) { + long rowCount = leftKeys.leftJoinRowCount(rightHash); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testLeftHashJoinGatherMapsNulls() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new 
Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + HashJoin rightHash = new HashJoin(rightKeys, true); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 9) // left + .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right + .build()) { + GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testLeftHashJoinGatherMapsNullsWithCount() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + HashJoin rightHash = new HashJoin(rightKeys,true); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 9) // left + .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right + .build()) { + long rowCount = leftKeys.leftJoinRowCount(rightHash); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = leftKeys.leftJoinGatherMaps(rightHash, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + @Test void testConditionalLeftJoinGatherMaps() { final int inv = Integer.MIN_VALUE; - BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER, + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column( 0, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9) .column(inv, inv, 0, 1, 3, inv, inv, 0, 1, inv, 1, inv, 0, 1) .build(); CompiledExpression condition = expr.compile()) { - GatherMap[] maps = left.leftJoinGatherMaps(right, condition, false); + GatherMap[] maps = left.conditionalLeftJoinGatherMaps(right, condition, false); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -1516,7 +1622,7 @@ void testConditionalLeftJoinGatherMaps() { @Test void testConditionalLeftJoinGatherMapsNulls() { final int inv = Integer.MIN_VALUE; - BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL, + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder() @@ -1530,7 +1636,65 @@ void testConditionalLeftJoinGatherMapsNulls() { .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right .build(); CompiledExpression condition = expr.compile()) { - GatherMap[] maps = left.leftJoinGatherMaps(right, condition, true); + GatherMap[] maps = left.conditionalLeftJoinGatherMaps(right, condition, true); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testConditionalLeftJoinGatherMapsWithCount() { + final int inv = Integer.MIN_VALUE; + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(0, TableReference.LEFT), + new 
ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9) + .column(inv, inv, 0, 1, 3, inv, inv, 0, 1, inv, 1, inv, 0, 1) + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalLeftJoinRowCount(right, condition, false); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = left.conditionalLeftJoinGatherMaps(right, condition, false, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testConditionalLeftJoinGatherMapsNullsWithCount() { + final int inv = Integer.MIN_VALUE; + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table right = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + Table expected = new Table.TestBuilder() + .column( 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 9) // left + .column(inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalLeftJoinRowCount(right, condition, true); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = left.conditionalLeftJoinGatherMaps(right, condition, true, rowCount); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -1583,19 +1747,113 @@ void testInnerJoinGatherMapsNulls() { } } + @Test + void testInnerHashJoinGatherMaps() { + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column(2, 7, 8, 9) // left + .column(2, 0, 1, 3) // right + .build()) { + GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testInnerHashJoinGatherMapsWithCount() { + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column(2, 7, 8, 9) // left + .column(2, 0, 1, 3) // right + .build()) { + long rowCount = leftKeys.innerJoinRowCount(rightHash); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testInnerHashJoinGatherMapsNulls() { + try (Table leftKeys = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + HashJoin rightHash = new HashJoin(rightKeys, true); + Table expected = new Table.TestBuilder() + .column(2, 7, 7, 8, 8, 9) // left + .column(2, 0, 1, 0, 1, 3) // right + .build()) 
{ + GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testInnerHashJoinGatherMapsNullsWithCount() { + try (Table leftKeys = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + HashJoin rightHash = new HashJoin(rightKeys, true); + Table expected = new Table.TestBuilder() + .column(2, 7, 7, 8, 8, 9) // left + .column(2, 0, 1, 0, 1, 3) // right + .build()) { + long rowCount = leftKeys.innerJoinRowCount(rightHash); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = leftKeys.innerJoinGatherMaps(rightHash, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + @Test void testConditionalInnerJoinGatherMaps() { - BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER, + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column(2, 2, 2, 5, 5, 7, 9, 9) .column(0, 1, 3, 0, 1, 1, 0, 1) .build(); CompiledExpression condition = expr.compile()) { - GatherMap[] maps = left.innerJoinGatherMaps(right, condition, false); + GatherMap[] maps = left.conditionalInnerJoinGatherMaps(right, condition, false); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -1608,7 +1866,61 @@ void testConditionalInnerJoinGatherMaps() { @Test void testConditionalInnerJoinGatherMapsNulls() { - BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL, + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table right = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + Table expected = new Table.TestBuilder() + .column(2, 7, 7, 8, 8, 9) // left + .column(2, 0, 1, 0, 1, 3) // right + .build(); + CompiledExpression condition = expr.compile()) { + GatherMap[] maps = left.conditionalInnerJoinGatherMaps(right, condition, true); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testConditionalInnerJoinGatherMapsWithCount() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); + Table expected = new Table.TestBuilder() + .column(2, 2, 2, 5, 5, 7, 9, 9) + .column(0, 1, 3, 0, 1, 1, 0, 1) + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalInnerJoinRowCount(right, condition, false); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = 
left.conditionalInnerJoinGatherMaps(right, condition, false, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testConditionalInnerJoinGatherMapsNullsWithCount() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder() @@ -1622,7 +1934,9 @@ void testConditionalInnerJoinGatherMapsNulls() { .column(2, 0, 1, 0, 1, 3) // right .build(); CompiledExpression condition = expr.compile()) { - GatherMap[] maps = left.innerJoinGatherMaps(right, condition, true); + long rowCount = left.conditionalInnerJoinRowCount(right, condition, true); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = left.conditionalInnerJoinGatherMaps(right, condition, true, rowCount); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -1677,20 +1991,118 @@ void testFullJoinGatherMapsNulls() { } } + @Test + void testFullHashJoinGatherMaps() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, null, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, null).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column(inv, inv, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9) // left + .column( 4, 5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) // right + .build()) { + GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testFullHashJoinGatherMapsWithCount() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, null, 1, 7, 4, 6, 5, 8).build(); + Table rightKeys = new Table.TestBuilder().column(6, 5, 9, 8, 10, null).build(); + HashJoin rightHash = new HashJoin(rightKeys, false); + Table expected = new Table.TestBuilder() + .column(inv, inv, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9) // left + .column( 4, 5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 3) // right + .build()) { + long rowCount = leftKeys.fullJoinRowCount(rightHash); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testFullHashJoinGatherMapsNulls() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + HashJoin rightHash = new HashJoin(rightKeys, true); + Table expected = new Table.TestBuilder() + .column(inv, inv, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 9) // left + .column( 4, 5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right + .build()) { + GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + + @Test + void testFullHashJoinGatherMapsNullsWithCount() { + final int inv = Integer.MIN_VALUE; + try (Table leftKeys = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table rightKeys = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + 
.build(); + HashJoin rightHash = new HashJoin(rightKeys, true); + Table expected = new Table.TestBuilder() + .column(inv, inv, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 8, 9) // left + .column( 4, 5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right + .build()) { + long rowCount = leftKeys.fullJoinRowCount(rightHash); + assertEquals(expected.getRowCount(), rowCount); + GatherMap[] maps = leftKeys.fullJoinGatherMaps(rightHash, rowCount); + try { + verifyJoinGatherMaps(maps, expected); + } finally { + for (GatherMap map : maps) { + map.close(); + } + } + } + } + @Test void testConditionalFullJoinGatherMaps() { final int inv = Integer.MIN_VALUE; - BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER, + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column(inv, inv, inv, 0, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 9) .column( 2, 4, 5, inv, inv, 0, 1, 3, inv, inv, 0, 1, inv, 1, inv, 0, 1) .build(); CompiledExpression condition = expr.compile()) { - GatherMap[] maps = left.fullJoinGatherMaps(right, condition, false); + GatherMap[] maps = left.conditionalFullJoinGatherMaps(right, condition, false); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -1704,7 +2116,7 @@ void testConditionalFullJoinGatherMaps() { @Test void testConditionalFullJoinGatherMapsNulls() { final int inv = Integer.MIN_VALUE; - BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL, + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder() @@ -1718,7 +2130,7 @@ void testConditionalFullJoinGatherMapsNulls() { .column( 4, 5, inv, inv, 2, inv, inv, inv, inv, 0, 1, 0, 1, 3) // right .build(); CompiledExpression condition = expr.compile()) { - GatherMap[] maps = left.fullJoinGatherMaps(right, condition, true); + GatherMap[] maps = left.conditionalFullJoinGatherMaps(right, condition, true); try { verifyJoinGatherMaps(maps, expected); } finally { @@ -1759,23 +2171,25 @@ void testLeftSemiJoinGatherMapNulls() { @Test void testConditionalLeftSemiJoinGatherMap() { - BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER, + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column(2, 5, 7, 9) // left .build(); CompiledExpression condition = expr.compile(); - GatherMap map = left.leftSemiJoinGatherMap(right, condition, false)) { + GatherMap map = left.conditionalLeftSemiJoinGatherMap(right, condition, false)) { verifySemiJoinGatherMap(map, expected); } } @Test void testConditionalLeftSemiJoinGatherMapNulls() { - BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL, + BinaryOperation expr = new 
BinaryOperation(BinaryOperator.EQUAL, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder() @@ -1788,11 +2202,57 @@ void testConditionalLeftSemiJoinGatherMapNulls() { .column(2, 7, 8, 9) // left .build(); CompiledExpression condition = expr.compile(); - GatherMap map = left.leftSemiJoinGatherMap(right, condition, true)) { + GatherMap map = left.conditionalLeftSemiJoinGatherMap(right, condition, true)) { verifySemiJoinGatherMap(map, expected); } } + @Test + void testConditionalLeftSemiJoinGatherMapWithCount() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); + Table expected = new Table.TestBuilder() + .column(2, 5, 7, 9) // left + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalLeftSemiJoinRowCount(right, condition, false); + assertEquals(expected.getRowCount(), rowCount); + try (GatherMap map = + left.conditionalLeftSemiJoinGatherMap(right, condition, false, rowCount)) { + verifySemiJoinGatherMap(map, expected); + } + } + } + + @Test + void testConditionalLeftSemiJoinGatherMapNullsWithCount() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table right = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + Table expected = new Table.TestBuilder() + .column(2, 7, 8, 9) // left + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalLeftSemiJoinRowCount(right, condition, true); + assertEquals(expected.getRowCount(), rowCount); + try (GatherMap map = + left.conditionalLeftSemiJoinGatherMap(right, condition, true, rowCount)) { + verifySemiJoinGatherMap(map, expected); + } + } + } + @Test void testAntiSemiJoinGatherMap() { try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); @@ -1823,23 +2283,25 @@ void testAntiSemiJoinGatherMapNulls() { @Test void testConditionalLeftAntiJoinGatherMap() { - BinaryExpression expr = new BinaryExpression(BinaryOperator.GREATER, + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, new ColumnReference(0, TableReference.LEFT), new ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); - Table right = new Table.TestBuilder().column(6, 5, 9, 8, 10, 32).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); Table expected = new Table.TestBuilder() .column(0, 1, 3, 4, 6, 8) // left .build(); CompiledExpression condition = expr.compile(); - GatherMap map = left.leftAntiJoinGatherMap(right, condition, false)) { + GatherMap map = left.conditionalLeftAntiJoinGatherMap(right, condition, false)) { verifySemiJoinGatherMap(map, expected); } } @Test void testConditionalAntiSemiJoinGatherMapNulls() { - BinaryExpression expr = new BinaryExpression(BinaryOperator.EQUAL, + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, new ColumnReference(0, TableReference.LEFT), new 
ColumnReference(0, TableReference.RIGHT)); try (Table left = new Table.TestBuilder() @@ -1852,11 +2314,57 @@ void testConditionalAntiSemiJoinGatherMapNulls() { .column(0, 1, 3, 4, 5, 6) // left .build(); CompiledExpression condition = expr.compile(); - GatherMap map = left.leftAntiJoinGatherMap(right, condition, true)) { + GatherMap map = left.conditionalLeftAntiJoinGatherMap(right, condition, true)) { verifySemiJoinGatherMap(map, expected); } } + @Test + void testConditionalLeftAntiJoinGatherMapWithCount() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); + Table right = new Table.TestBuilder() + .column(6, 5, 9, 8, 10, 32) + .column(0, 1, 2, 3, 4, 5).build(); + Table expected = new Table.TestBuilder() + .column(0, 1, 3, 4, 6, 8) // left + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalLeftAntiJoinRowCount(right, condition, false); + assertEquals(expected.getRowCount(), rowCount); + try (GatherMap map = + left.conditionalLeftAntiJoinGatherMap(right, condition, false, rowCount)) { + verifySemiJoinGatherMap(map, expected); + } + } + } + + @Test + void testConditionalAntiSemiJoinGatherMapNullsWithCount() { + BinaryOperation expr = new BinaryOperation(BinaryOperator.EQUAL, + new ColumnReference(0, TableReference.LEFT), + new ColumnReference(0, TableReference.RIGHT)); + try (Table left = new Table.TestBuilder() + .column(2, 3, 9, 0, 1, 7, 4, null, null, 8) + .build(); + Table right = new Table.TestBuilder() + .column(null, null, 9, 8, 10, 32) + .build(); + Table expected = new Table.TestBuilder() + .column(0, 1, 3, 4, 5, 6) // left + .build(); + CompiledExpression condition = expr.compile()) { + long rowCount = left.conditionalLeftAntiJoinRowCount(right, condition, true); + assertEquals(expected.getRowCount(), rowCount); + try (GatherMap map = + left.conditionalLeftAntiJoinGatherMap(right, condition, true, rowCount)) { + verifySemiJoinGatherMap(map, expected); + } + } + } + @Test void testBoundsNulls() { boolean[] descFlags = new boolean[1]; @@ -2728,7 +3236,7 @@ void testSerializationRoundTripConcatHostSide() throws IOException { try (Table found = JCudfSerialization.readAndConcat( headers.toArray(new JCudfSerialization.SerializedTableHeader[headers.size()]), buffers.toArray(new HostMemoryBuffer[buffers.size()]))) { - assertPartialTablesAreEqual(t, 0, t.getRowCount(), found, false); + assertPartialTablesAreEqual(t, 0, t.getRowCount(), found, false, false); } } finally { for (HostMemoryBuffer buff: buffers) { @@ -2781,7 +3289,7 @@ void testConcatHost() throws IOException { try (Table result = JCudfSerialization.readAndConcat( new JCudfSerialization.SerializedTableHeader[] {header, header}, new HostMemoryBuffer[] {buff, buff})) { - assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), result, false); + assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), result, false, false); } } } @@ -2822,7 +3330,7 @@ void testSerializationRoundTripSlicedHostSide() throws IOException { buffers.toArray(new HostMemoryBuffer[buffers.size()]), bout2); ByteArrayInputStream bin2 = new ByteArrayInputStream(bout2.toByteArray()); try (JCudfSerialization.TableAndRowCountPair found = JCudfSerialization.readTableFrom(bin2)) { - assertPartialTablesAreEqual(t, 0, t.getRowCount(), found.getTable(), false); + assertPartialTablesAreEqual(t, 0, 
t.getRowCount(), found.getTable(), false, false); assertEquals(found.getTable(), found.getContiguousTable().getTable()); assertNotNull(found.getContiguousTable().getBuffer()); } @@ -2848,7 +3356,7 @@ void testSerializationRoundTripSliced() throws IOException { JCudfSerialization.writeToStream(t, bout, i, len); ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray()); try (JCudfSerialization.TableAndRowCountPair found = JCudfSerialization.readTableFrom(bin)) { - assertPartialTablesAreEqual(t, i, len, found.getTable(), i == 0 && len == t.getRowCount()); + assertPartialTablesAreEqual(t, i, len, found.getTable(), i == 0 && len == t.getRowCount(), false); assertEquals(found.getTable(), found.getContiguousTable().getTable()); assertNotNull(found.getContiguousTable().getBuffer()); } @@ -2902,12 +3410,12 @@ void testGroupByScan() { .withKeysSorted(true) .withKeysDescending(false, false) .build(), 0, 1) - .scan(Aggregation.sum().onColumn(2), - Aggregation.count(NullPolicy.INCLUDE).onColumn(2), - Aggregation.min().onColumn(2), - Aggregation.max().onColumn(2), - Aggregation.rank().onColumn(3), - Aggregation.denseRank().onColumn(3)); + .scan(GroupByScanAggregation.sum().onColumn(2), + GroupByScanAggregation.count(NullPolicy.INCLUDE).onColumn(2), + GroupByScanAggregation.min().onColumn(2), + GroupByScanAggregation.max().onColumn(2), + GroupByScanAggregation.rank().onColumn(3), + GroupByScanAggregation.denseRank().onColumn(3)); Table expected = new Table.TestBuilder() .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") .column( 0, 1, 3, 3, 5, 5, 5, 5, 5, 5, 5) @@ -2957,7 +3465,7 @@ void testGroupByUniqueCount() { .build()) { try (Table t3 = t1 .groupBy(0, 1) - .aggregate(Aggregation.nunique().onColumn(0)); + .aggregate(GroupByAggregation.nunique().onColumn(0)); Table sorted = t3.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); Table expected = new Table.TestBuilder() .column( "1", "1", "1", "1") @@ -2978,7 +3486,7 @@ void testGroupByUniqueCountNulls() { .build()) { try (Table t3 = t1 .groupBy(0, 1) - .aggregate(Aggregation.nunique(NullPolicy.INCLUDE).onColumn(0)); + .aggregate(GroupByAggregation.nunique(NullPolicy.INCLUDE).onColumn(0)); Table sorted = t3.orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); Table expected = new Table.TestBuilder() .column( "1", "1", "1", "1") @@ -2997,7 +3505,7 @@ void testGroupByCount() { .column(12.0, 14.0, 13.0, 17.0, 17.0, 17.0) .build()) { try (Table t3 = t1.groupBy(0, 1) - .aggregate(Aggregation.count().onColumn(0)); + .aggregate(GroupByAggregation.count().onColumn(0)); HostColumnVector aggOut1 = t3.getColumn(2).copyToHost()) { // verify t3 assertEquals(4, t3.getRowCount()); @@ -3048,9 +3556,9 @@ void testWindowingCount() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation.count().onColumn(3).overWindow(window)); + .aggregateWindows(RollingAggregation.count().onColumn(3).overWindow(window)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation.count().onColumn(3).overWindow(window)); + .aggregateWindows(RollingAggregation.count().onColumn(3).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); assertColumnsAreEqual(expect, decWindowAggResults.getColumn(0)); @@ -3088,9 +3596,9 @@ void testWindowingMin() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - 
.aggregateWindows(Aggregation.min().onColumn(3).overWindow(window)); + .aggregateWindows(RollingAggregation.min().onColumn(3).overWindow(window)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation.min().onColumn(6).overWindow(window)); + .aggregateWindows(RollingAggregation.min().onColumn(6).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(5, 1, 1, 1, 7, 7, 2, 2, 0, 0, 0, 6); ColumnVector decExpect = ColumnVector.decimalFromLongs(2, 5, 1, 1, 1, 7, 7, 2, 2, 0, 0, 0, 6)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); @@ -3129,9 +3637,9 @@ void testWindowingMax() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation.max().onColumn(3).overWindow(window)); + .aggregateWindows(RollingAggregation.max().onColumn(3).overWindow(window)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation.max().onColumn(6).overWindow(window)); + .aggregateWindows(RollingAggregation.max().onColumn(6).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(7, 7, 9, 9, 9, 9, 9, 8, 8, 8, 6, 6); ColumnVector decExpect = ColumnVector.decimalFromLongs(2, 7, 7, 9, 9, 9, 9, 9, 8, 8, 8, 6, 6)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); @@ -3163,7 +3671,7 @@ void testWindowingSum() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation.sum().onColumn(3).overWindow(window)); + .aggregateWindows(RollingAggregation.sum().onColumn(3).overWindow(window)); ColumnVector expectAggResult = ColumnVector.fromBoxedLongs(12L, 13L, 15L, 10L, 16L, 24L, 19L, 10L, 8L, 14L, 12L, 12L)) { assertColumnsAreEqual(expectAggResult, windowAggResults.getColumn(0)); } @@ -3199,12 +3707,12 @@ void testWindowingRowNumber() { WindowOptions options = windowBuilder.window(two, one).build(); WindowOptions options1 = windowBuilder.window(two, one).build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .rowNumber() .onColumn(3) .overWindow(options)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .rowNumber() .onColumn(6) .overWindow(options1)); @@ -3219,12 +3727,12 @@ void testWindowingRowNumber() { WindowOptions options = windowBuilder.window(three, two).build(); WindowOptions options1 = windowBuilder.window(three, two).build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .rowNumber() .onColumn(3) .overWindow(options)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .rowNumber() .onColumn(6) .overWindow(options1)); @@ -3239,12 +3747,12 @@ void testWindowingRowNumber() { WindowOptions options = windowBuilder.window(four, three).build(); WindowOptions options1 = windowBuilder.window(four, three).build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .rowNumber() .onColumn(3) .overWindow(options)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .rowNumber() .onColumn(6) .overWindow(options1)); @@ -3259,8 +3767,8 @@ void testWindowingRowNumber() { @Test void testWindowingCollectList() { - Aggregation aggCollectWithNulls = Aggregation.collectList(NullPolicy.INCLUDE); - Aggregation 
aggCollect = Aggregation.collectList(); + RollingAggregation aggCollectWithNulls = RollingAggregation.collectList(NullPolicy.INCLUDE); + RollingAggregation aggCollect = RollingAggregation.collectList(); try (Scalar two = Scalar.fromInt(2); Scalar one = Scalar.fromInt(1); WindowOptions winOpts = WindowOptions.builder() @@ -3335,12 +3843,12 @@ void testWindowingCollectList() { @Test void testWindowingCollectSet() { - Aggregation aggCollect = Aggregation.collectSet(); - Aggregation aggCollectWithEqNulls = Aggregation.collectSet(NullPolicy.INCLUDE, + RollingAggregation aggCollect = RollingAggregation.collectSet(); + RollingAggregation aggCollectWithEqNulls = RollingAggregation.collectSet(NullPolicy.INCLUDE, NullEquality.EQUAL, NaNEquality.UNEQUAL); - Aggregation aggCollectWithUnEqNulls = Aggregation.collectSet(NullPolicy.INCLUDE, + RollingAggregation aggCollectWithUnEqNulls = RollingAggregation.collectSet(NullPolicy.INCLUDE, NullEquality.UNEQUAL, NaNEquality.UNEQUAL); - Aggregation aggCollectWithEqNaNs = Aggregation.collectSet(NullPolicy.INCLUDE, + RollingAggregation aggCollectWithEqNaNs = RollingAggregation.collectSet(NullPolicy.INCLUDE, NullEquality.EQUAL, NaNEquality.ALL_EQUAL); try (Scalar two = Scalar.fromInt(2); @@ -3473,22 +3981,22 @@ void testWindowingLead() { Scalar one = Scalar.fromInt(1); WindowOptions options = windowBuilder.window(two, one).build(); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(0) .onColumn(3) // Int Agg Column .overWindow(options)); Table decWindowAggResults = decSorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(0) .onColumn(6) // Decimal Agg Column .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(0) .onColumn(7) // List Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(0) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3517,22 +4025,22 @@ void testWindowingLead() { Scalar one = Scalar.fromInt(1); WindowOptions options = windowBuilder.window(zero, one).build(); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(1) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(1) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(1) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(1) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3575,22 +4083,22 @@ null, new StructData(13, "s13"), new StructData(14, "s14"), null, new StructData(-111, "s111"), new StructData(null, "s112"), new StructData(-222, "s222"), new StructData(-333, "s333")); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(1, defaultOutput) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(1, decDefaultOutput) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table 
listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(1, listDefaultOutput) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(1, structDefaultOutput) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3619,22 +4127,22 @@ null, new StructData(13, "s13"), new StructData(14, "s14"), new StructData(-14, Scalar one = Scalar.fromInt(1); WindowOptions options = windowBuilder.window(zero, one).build(); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(3) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lead(3) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(3) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lead(3) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3694,22 +4202,22 @@ void testWindowingLag() { Scalar one = Scalar.fromInt(1); WindowOptions options = windowBuilder.window(two, one).build(); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(0) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(0) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(0) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(0) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3737,22 +4245,22 @@ void testWindowingLag() { Scalar two = Scalar.fromInt(2); WindowOptions options = windowBuilder.window(two, zero).build(); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(1) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(1) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(1) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(1) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3794,22 +4302,22 @@ null, new StructData(111, "s111"), new StructData(null, "s112"), new StructData( new StructData(-11, "s11"), null, new StructData(-13, "s13"), new StructData(-14, "s14"), new StructData(-111, "s111"), new StructData(null, "s112"), new StructData(-222, "s222"), new StructData(-333, "s333")); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(1, defaultOutput) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + 
.aggregateWindows(RollingAggregation .lag(1, decDefaultOutput) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(1, listDefaultOutput) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(1, structDefaultOutput) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3838,22 +4346,22 @@ null, new StructData(111, "s111"), new StructData(null, "s112"), new StructData( Scalar one = Scalar.fromInt(1); WindowOptions options = windowBuilder.window(one, zero).build(); Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(3) .onColumn(3) //Int Agg COLUMN .overWindow(options)); Table decWindowAggResults = sorted.groupBy(0, 4) - .aggregateWindows(Aggregation + .aggregateWindows(RollingAggregation .lag(3) .onColumn(6) //Decimal Agg COLUMN .overWindow(options)); Table listWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(3) .onColumn(7) //LIST Agg COLUMN .overWindow(options)); Table structWindowAggResults = sorted.groupBy(0, 1).aggregateWindows( - Aggregation + RollingAggregation .lag(3) .onColumn(8) //STRUCT Agg COLUMN .overWindow(options)); @@ -3896,7 +4404,7 @@ void testWindowingMean() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation.mean().onColumn(3).overWindow(window)); + .aggregateWindows(RollingAggregation.mean().onColumn(3).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedDoubles(6.0d, 5.0d, 5.0d, 5.0d, 8.0d, 8.0d, 7.0d, 6.0d, 4.0d, 4.0d, 4.0d, 6.0d)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -3941,10 +4449,10 @@ void testWindowingOnMultipleDifferentColumns() { try (Table windowAggResults = sorted.groupBy(0, 1) .aggregateWindows( - Aggregation.sum().onColumn(3).overWindow(window_1), - Aggregation.max().onColumn(3).overWindow(window_1), - Aggregation.sum().onColumn(3).overWindow(window_2), - Aggregation.min().onColumn(2).overWindow(window_3) + RollingAggregation.sum().onColumn(3).overWindow(window_1), + RollingAggregation.max().onColumn(3).overWindow(window_1), + RollingAggregation.sum().onColumn(3).overWindow(window_2), + RollingAggregation.min().onColumn(2).overWindow(window_3) ); ColumnVector expect_0 = ColumnVector.fromBoxedLongs(12L, 13L, 15L, 10L, 16L, 24L, 19L, 10L, 8L, 14L, 12L, 12L); ColumnVector expect_1 = ColumnVector.fromBoxedInts(7, 7, 9, 9, 9, 9, 9, 8, 8, 8, 6, 6); @@ -3979,8 +4487,8 @@ void testWindowingWithoutGroupByColumns() { .build()) { try (Table windowAggResults = sorted.groupBy().aggregateWindows( - Aggregation.sum().onColumn(1).overWindow(window)); - ColumnVector expectAggResult = ColumnVector.fromBoxedLongs(12L, 13L, 15L, 17L, 25L, 24L, 19L, 18L, 10L, 14L, 12L, 12L); + RollingAggregation.sum().onColumn(1).overWindow(window)); + ColumnVector expectAggResult = ColumnVector.fromBoxedLongs(12L, 13L, 15L, 17L, 25L, 24L, 19L, 18L, 10L, 14L, 12L, 12L) ) { assertColumnsAreEqual(expectAggResult, windowAggResults.getColumn(0)); } @@ -4054,7 +4562,7 @@ void testRangeWindowingCount() { .orderByColumnIndex(orderIndex) .build()) { try (Table windowAggResults = sorted.groupBy(0, 1).aggregateWindowsOverRanges( - Aggregation.count().onColumn(2).overWindow(window)); + RollingAggregation.count().onColumn(2).overWindow(window)); ColumnVector expect = 
ColumnVector.fromBoxedInts(3, 3, 4, 2, 4, 4, 4, 4, 4, 4, 5, 5, 3)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -4098,7 +4606,7 @@ void testRangeWindowingLead() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindowsOverRanges(Aggregation.lead(1) + .aggregateWindowsOverRanges(RollingAggregation.lead(1) .onColumn(2) .overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(5, 1, 9, null, 9, 8, 2, null, 0, 6, 6, 8, null)) { @@ -4144,7 +4652,7 @@ void testRangeWindowingMax() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindowsOverRanges(Aggregation.max().onColumn(2).overWindow(window)); + .aggregateWindowsOverRanges(RollingAggregation.max().onColumn(2).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(7, 7, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -4158,7 +4666,7 @@ void testRangeWindowingMax() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindows(Aggregation.max().onColumn(2).overWindow(window)); + .aggregateWindows(RollingAggregation.max().onColumn(2).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(7, 7, 9, 9, 9, 9, 9, 8, 8, 8, 6, 8, 8)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -4202,7 +4710,7 @@ void testRangeWindowingRowNumber() { .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindowsOverRanges(Aggregation.rowNumber().onColumn(2).overWindow(window)); + .aggregateWindowsOverRanges(RollingAggregation.rowNumber().onColumn(2).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 5)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -4254,12 +4762,12 @@ void testRangeWindowingCountDescendingTimestamps() { .window(preceding_1, following_1) .orderByColumnIndex(orderIndex) .orderByDescending() - .build();) { + .build()) { try (Table windowAggResults = sorted.groupBy(0, 1) .aggregateWindowsOverRanges( - Aggregation.count().onColumn(2).overWindow(window_0), - Aggregation.sum().onColumn(2).overWindow(window_1)); + RollingAggregation.count().onColumn(2).overWindow(window_0), + RollingAggregation.sum().onColumn(2).overWindow(window_1)); ColumnVector expect_0 = ColumnVector.fromBoxedInts(3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 5, 5, 5); ColumnVector expect_1 = ColumnVector.fromBoxedLongs(7L, 13L, 13L, 22L, 7L, 24L, 24L, 26L, 8L, 8L, 14L, 28L, 28L)) { assertColumnsAreEqual(expect_0, windowAggResults.getColumn(0)); @@ -4303,7 +4811,7 @@ void testRangeWindowingWithoutGroupByColumns() { .build();) { try (Table windowAggResults = sorted.groupBy() - .aggregateWindowsOverRanges(Aggregation.count().onColumn(1).overWindow(window)); + .aggregateWindowsOverRanges(RollingAggregation.count().onColumn(1).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(3, 3, 6, 6, 6, 6, 7, 7, 6, 6, 5, 5, 3)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -4333,7 +4841,7 @@ void testRangeWindowingOrderByUnsupportedDataTypeExceptions() { assertThrows(IllegalArgumentException.class, () -> table .groupBy(0, 1) - .aggregateWindowsOverRanges(Aggregation.max().onColumn(2).overWindow(rangeBasedWindow))); + .aggregateWindowsOverRanges(RollingAggregation.max().onColumn(2).overWindow(rangeBasedWindow))); } } } @@ -4353,7 +4861,7 @@ void testInvalidWindowTypeExceptions() { .minPeriods(1) .window(one, one) .build()) { - 
assertThrows(IllegalArgumentException.class, () -> table.groupBy(0, 1).aggregateWindowsOverRanges(Aggregation.max().onColumn(3).overWindow(rowBasedWindow))); + assertThrows(IllegalArgumentException.class, () -> table.groupBy(0, 1).aggregateWindowsOverRanges(RollingAggregation.max().onColumn(3).overWindow(rowBasedWindow))); } try (WindowOptions rangeBasedWindow = WindowOptions.builder() @@ -4361,7 +4869,7 @@ void testInvalidWindowTypeExceptions() { .window(one, one) .orderByColumnIndex(2) .build()) { - assertThrows(IllegalArgumentException.class, () -> table.groupBy(0, 1).aggregateWindows(Aggregation.max().onColumn(3).overWindow(rangeBasedWindow))); + assertThrows(IllegalArgumentException.class, () -> table.groupBy(0, 1).aggregateWindows(RollingAggregation.max().onColumn(3).overWindow(rangeBasedWindow))); } } } @@ -4399,7 +4907,7 @@ void testRangeWindowingCountUnboundedPreceding() { .build();) { try (Table windowAggResults = sorted.groupBy(0, 1) - .aggregateWindowsOverRanges(Aggregation.count().onColumn(2).overWindow(window)); + .aggregateWindowsOverRanges(RollingAggregation.count().onColumn(2).overWindow(window)); ColumnVector expect = ColumnVector.fromBoxedInts(3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5)) { assertColumnsAreEqual(expect, windowAggResults.getColumn(0)); } @@ -4475,11 +4983,11 @@ void testRangeWindowingCountUnboundedASCWithNullsFirst() { try (Table windowAggResults = sorted.groupBy(0, 1) .aggregateWindowsOverRanges( - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), - Aggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), - Aggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), + RollingAggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), + RollingAggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); ColumnVector expect_0 = ColumnVector.fromBoxedInts(3, 3, 3, 5, 5, 6, 2, 2, 4, 4, 6, 6, 7); ColumnVector expect_1 = ColumnVector.fromBoxedInts(6, 6, 6, 3, 3, 1, 7, 7, 5, 5, 3, 3, 1); ColumnVector expect_2 = ColumnVector.fromBoxedInts(6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7); @@ -4570,11 +5078,11 @@ void testRangeWindowingCountUnboundedDESCWithNullsFirst() { try (Table windowAggResults = sorted.groupBy(0, 1) .aggregateWindowsOverRanges( - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), - Aggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), - Aggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), + RollingAggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), + RollingAggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); ColumnVector expect_0 = 
ColumnVector.fromBoxedInts(3, 3, 3, 4, 6, 6, 2, 2, 3, 5, 5, 7, 7); ColumnVector expect_1 = ColumnVector.fromBoxedInts(6, 6, 6, 3, 2, 2, 7, 7, 5, 4, 4, 2, 2); ColumnVector expect_2 = ColumnVector.fromBoxedInts(6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7); @@ -4658,11 +5166,11 @@ void testRangeWindowingCountUnboundedASCWithNullsLast() { try (Table windowAggResults = sorted.groupBy(0, 1) .aggregateWindowsOverRanges( - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), - Aggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), - Aggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), + RollingAggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), + RollingAggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); ColumnVector expect_0 = ColumnVector.fromBoxedInts(2, 2, 3, 6, 6, 6, 2, 2, 4, 4, 5, 7, 7); ColumnVector expect_1 = ColumnVector.fromBoxedInts(6, 6, 4, 3, 3, 3, 7, 7, 5, 5, 3, 2, 2); ColumnVector expect_2 = ColumnVector.fromBoxedInts(6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7); @@ -4752,11 +5260,11 @@ void testRangeWindowingCountUnboundedDESCWithNullsLast() { try (Table windowAggResults = sorted.groupBy(0, 1) .aggregateWindowsOverRanges( - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), - Aggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), - Aggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), - Aggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingOneFollowing), + RollingAggregation.count().onColumn(2).overWindow(onePrecedingUnboundedFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndFollowing), + RollingAggregation.count().onColumn(2).overWindow(unboundedPrecedingAndCurrentRow), + RollingAggregation.count().onColumn(2).overWindow(currentRowAndUnboundedFollowing)); ColumnVector expect_0 = ColumnVector.fromBoxedInts(1, 3, 3, 6, 6, 6, 1, 3, 3, 5, 5, 7, 7); ColumnVector expect_1 = ColumnVector.fromBoxedInts(6, 5, 5, 3, 3, 3, 7, 6, 6, 4, 4, 2, 2); ColumnVector expect_2 = ColumnVector.fromBoxedInts(6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7); @@ -4784,9 +5292,9 @@ void testGroupByCountWithNulls() { .column( 1, 1, 1, null, 1, 1) .build()) { try (Table tmp = t1.groupBy(0).aggregate( - Aggregation.count().onColumn(1), - Aggregation.count().onColumn(2), - Aggregation.count().onColumn(3)); + GroupByAggregation.count().onColumn(1), + GroupByAggregation.count().onColumn(2), + GroupByAggregation.count().onColumn(3)); Table t3 = tmp.orderBy(OrderByArg.asc(0, true)); HostColumnVector groupCol = t3.getColumn(0).copyToHost(); HostColumnVector countCol = t3.getColumn(1).copyToHost(); @@ -4824,10 +5332,10 @@ void testGroupByCountWithNullsIncluded() { .column( 1, 1, 1, null, 1, 1) .build()) { try (Table tmp = t1.groupBy(0).aggregate( - Aggregation.count(NullPolicy.INCLUDE).onColumn(1), - 
Aggregation.count(NullPolicy.INCLUDE).onColumn(2), - Aggregation.count(NullPolicy.INCLUDE).onColumn(3), - Aggregation.count().onColumn(3)); + GroupByAggregation.count(NullPolicy.INCLUDE).onColumn(1), + GroupByAggregation.count(NullPolicy.INCLUDE).onColumn(2), + GroupByAggregation.count(NullPolicy.INCLUDE).onColumn(3), + GroupByAggregation.count().onColumn(3)); Table t3 = tmp.orderBy(OrderByArg.asc(0, true)); HostColumnVector groupCol = t3.getColumn(0).copyToHost(); HostColumnVector countCol = t3.getColumn(1).copyToHost(); @@ -4875,9 +5383,9 @@ void testGroupByCountWithCollapsingNulls() { .build(); try (Table tmp = t1.groupBy(options, 0).aggregate( - Aggregation.count().onColumn(1), - Aggregation.count().onColumn(2), - Aggregation.count().onColumn(3)); + GroupByAggregation.count().onColumn(1), + GroupByAggregation.count().onColumn(2), + GroupByAggregation.count().onColumn(3)); Table t3 = tmp.orderBy(OrderByArg.asc(0, true)); HostColumnVector groupCol = t3.getColumn(0).copyToHost(); HostColumnVector countCol = t3.getColumn(1).copyToHost(); @@ -4908,7 +5416,7 @@ void testGroupByMax() { .column( 1, 3, 3, 5, 5, 0) .column(12.0, 14.0, 13.0, 17.0, 17.0, 17.0) .build()) { - try (Table t3 = t1.groupBy(0, 1).aggregate(Aggregation.max().onColumn(2)); + try (Table t3 = t1.groupBy(0, 1).aggregate(GroupByAggregation.max().onColumn(2)); HostColumnVector aggOut1 = t3.getColumn(2).copyToHost()) { // verify t3 assertEquals(4, t3.getRowCount()); @@ -4943,7 +5451,7 @@ void testGroupByArgMax() { .column(17.0, 14.0, 14.0, 17.0, 17.1, 17.0) .build()) { try (Table t3 = t1.groupBy(0, 1) - .aggregate(Aggregation.argMax().onColumn(2)); + .aggregate(GroupByAggregation.argMax().onColumn(2)); Table sorted = t3 .orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); Table expected = new Table.TestBuilder() @@ -4965,7 +5473,7 @@ void testGroupByArgMin() { .column(17.0, 14.0, 14.0, 17.0, 17.1, 17.0) .build()) { try (Table t3 = t1.groupBy(0, 1) - .aggregate(Aggregation.argMin().onColumn(2)); + .aggregate(GroupByAggregation.argMin().onColumn(2)); Table sorted = t3 .orderBy(OrderByArg.asc(0), OrderByArg.asc(1), OrderByArg.asc(2)); Table expected = new Table.TestBuilder() @@ -4983,7 +5491,7 @@ void testGroupByMinBool() { try (Table t1 = new Table.TestBuilder() .column(true, null, false, true, null, null) .column( 1, 1, 2, 2, 3, 3).build(); - Table other = t1.groupBy(1).aggregate(Aggregation.min().onColumn(0)); + Table other = t1.groupBy(1).aggregate(GroupByAggregation.min().onColumn(0)); Table ordered = other.orderBy(OrderByArg.asc(0)); Table expected = new Table.TestBuilder() .column(1, 2, 3) @@ -4998,7 +5506,7 @@ void testGroupByMaxBool() { try (Table t1 = new Table.TestBuilder() .column(false, null, false, true, null, null) .column( 1, 1, 2, 2, 3, 3).build(); - Table other = t1.groupBy(1).aggregate(Aggregation.max().onColumn(0)); + Table other = t1.groupBy(1).aggregate(GroupByAggregation.max().onColumn(0)); Table ordered = other.orderBy(OrderByArg.asc(0)); Table expected = new Table.TestBuilder() .column(1, 2, 3) @@ -5025,12 +5533,12 @@ void testGroupByDuplicateAggregates() { .column( 1, 2, 2, 1).build()) { try (Table t3 = t1.groupBy(0, 1) .aggregate( - Aggregation.max().onColumn(2), - Aggregation.min().onColumn(2), - Aggregation.min().onColumn(2), - Aggregation.max().onColumn(2), - Aggregation.min().onColumn(2), - Aggregation.count().onColumn(1)); + GroupByAggregation.max().onColumn(2), + GroupByAggregation.min().onColumn(2), + GroupByAggregation.min().onColumn(2), + GroupByAggregation.max().onColumn(2), + 
GroupByAggregation.min().onColumn(2), + GroupByAggregation.count().onColumn(1)); Table t4 = t3.orderBy(OrderByArg.asc(2))) { // verify t4 assertEquals(4, t4.getRowCount()); @@ -5053,7 +5561,7 @@ void testGroupByMin() { .column( 1, 3, 3, 5, 5, 0) .column( 12, 14, 13, 17, 17, 17) .build()) { - try (Table t3 = t1.groupBy(0, 1).aggregate(Aggregation.min().onColumn(2)); + try (Table t3 = t1.groupBy(0, 1).aggregate(GroupByAggregation.min().onColumn(2)); HostColumnVector aggOut0 = t3.getColumn(2).copyToHost()) { // verify t3 assertEquals(4, t3.getRowCount()); @@ -5088,7 +5596,7 @@ void testGroupBySum() { .column( 1, 3, 3, 5, 5, 0) .column(12.0, 14.0, 13.0, 17.0, 17.0, 17.0) .build()) { - try (Table t3 = t1.groupBy(0, 1).aggregate(Aggregation.sum().onColumn(2)); + try (Table t3 = t1.groupBy(0, 1).aggregate(GroupByAggregation.sum().onColumn(2)); HostColumnVector aggOut1 = t3.getColumn(2).copyToHost()) { // verify t3 assertEquals(4, t3.getRowCount()); @@ -5121,7 +5629,7 @@ void testGroupByM2() { try (Table input = new Table.TestBuilder().column(1, 2, 3, 1, 2, 2, 1, 3, 3, 2) .column(0, 1, -2, 3, -4, -5, -6, 7, -8, 9) .build(); - Table results = input.groupBy(0).aggregate(Aggregation.M2() + Table results = input.groupBy(0).aggregate(GroupByAggregation.M2() .onColumn(1)); Table expected = new Table.TestBuilder().column(1, 2, 3) .column(42.0, 122.75, 114.0) @@ -5134,7 +5642,7 @@ void testGroupByM2() { try (Table input = new Table.TestBuilder().column(1, 2, 5, 3, 4, 5, 2, 3, 2, 5) .column(0, null, null, 2, 3, null, 5, 6, 7, null) .build(); - Table results = input.groupBy(0).aggregate(Aggregation.M2() + Table results = input.groupBy(0).aggregate(GroupByAggregation.M2() .onColumn(1)); Table expected = new Table.TestBuilder().column(1, 2, 3, 4, 5) .column(0.0, 2.0, 8.0, 0.0, null) @@ -5146,7 +5654,7 @@ void testGroupByM2() { try (Table input = new Table.TestBuilder().column(4, 3, 1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4, 4) .column(null, null, 0.0, 1.0, 2.0, 3.0, 4.0, Double.NaN, 6.0, 7.0, 8.0, 9.0, 10.0, Double.NaN) .build(); - Table results = input.groupBy(0).aggregate(Aggregation.M2() + Table results = input.groupBy(0).aggregate(GroupByAggregation.M2() .onColumn(1)); Table expected = new Table.TestBuilder().column(1, 2, 3, 4, null) .column(18.0, Double.NaN, 18.0, Double.NaN, 0.0) @@ -5179,7 +5687,7 @@ void testGroupByM2() { Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY) .build(); - Table results = input.groupBy(0).aggregate(Aggregation.M2() + Table results = input.groupBy(0).aggregate(GroupByAggregation.M2() .onColumn(1)); Table expected = new Table.TestBuilder().column(1, 2, 3, 4, 5) .column(Double.NaN, Double.NaN, Double.NaN, Double.NaN, 12.5) @@ -5237,7 +5745,7 @@ void testGroupByMergeM2() { partialResults3, partialResults4); Table finalResults = concatenatedResults.groupBy(0).aggregate( - Aggregation.mergeM2().onColumn(1)) + GroupByAggregation.mergeM2().onColumn(1)) ) { assertTablesAreEqual(expected, finalResults); } @@ -5255,7 +5763,7 @@ void testGroupByFirstExcludeNulls() { .column(13, 14) .build(); Table found = input.groupBy(0).aggregate( - Aggregation.nth(0, NullPolicy.EXCLUDE).onColumn(1))) { + GroupByAggregation.nth(0, NullPolicy.EXCLUDE).onColumn(1))) { assertTablesAreEqual(expected, found); } } @@ -5271,7 +5779,7 @@ void testGroupByLastExcludeNulls() { .column(12, 15) .build(); Table found = input.groupBy(0).aggregate( - Aggregation.nth(-1, NullPolicy.EXCLUDE).onColumn(1))) { + GroupByAggregation.nth(-1, NullPolicy.EXCLUDE).onColumn(1))) { assertTablesAreEqual(expected, found); } } @@ -5287,7 
+5795,7 @@ void testGroupByFirstIncludeNulls() { .column(null, 14) .build(); Table found = input.groupBy(0).aggregate( - Aggregation.nth(0, NullPolicy.INCLUDE).onColumn(1))) { + GroupByAggregation.nth(0, NullPolicy.INCLUDE).onColumn(1))) { assertTablesAreEqual(expected, found); } } @@ -5303,7 +5811,7 @@ void testGroupByLastIncludeNulls() { .column(12, null) .build(); Table found = input.groupBy(0).aggregate( - Aggregation.nth(-1, NullPolicy.INCLUDE).onColumn(1))) { + GroupByAggregation.nth(-1, NullPolicy.INCLUDE).onColumn(1))) { assertTablesAreEqual(expected, found); } } @@ -5314,7 +5822,7 @@ void testGroupByAvg() { .column( 1, 3, 3, 5, 5, 0) .column(12, 14, 13, 1, 17, 17) .build()) { - try (Table t3 = t1.groupBy(0, 1).aggregate(Aggregation.mean().onColumn(2)); + try (Table t3 = t1.groupBy(0, 1).aggregate(GroupByAggregation.mean().onColumn(2)); HostColumnVector aggOut1 = t3.getColumn(2).copyToHost()) { // verify t3 assertEquals(4, t3.getRowCount()); @@ -5349,11 +5857,11 @@ void testMultiAgg() { .column( 3, 1, 7, -1, 9, 0) .build()) { try (Table t2 = t1.groupBy(0, 1).aggregate( - Aggregation.count().onColumn(0), - Aggregation.max().onColumn(3), - Aggregation.min().onColumn(2), - Aggregation.mean().onColumn(2), - Aggregation.sum().onColumn(2)); + GroupByAggregation.count().onColumn(0), + GroupByAggregation.max().onColumn(3), + GroupByAggregation.min().onColumn(2), + GroupByAggregation.mean().onColumn(2), + GroupByAggregation.sum().onColumn(2)); HostColumnVector countOut = t2.getColumn(2).copyToHost(); HostColumnVector maxOut = t2.getColumn(3).copyToHost(); HostColumnVector minOut = t2.getColumn(4).copyToHost(); @@ -5419,7 +5927,7 @@ void testSumWithStrings() { .column(5289L, 5203L, 5303L, 5206L) .build(); Table result = t.groupBy(0).aggregate( - Aggregation.sum().onColumn(1)); + GroupByAggregation.sum().onColumn(1)); Table expected = new Table.TestBuilder() .column("1-URGENT", "3-MEDIUM") .column(5289L + 5303L, 5203L + 5206L) @@ -5517,7 +6025,7 @@ void testGroupByCollectListIncludeNulls() { Arrays.asList(0)) .build(); Table found = input.groupBy(0).aggregate( - Aggregation.collectList(NullPolicy.INCLUDE).onColumn(1))) { + GroupByAggregation.collectList(NullPolicy.INCLUDE).onColumn(1))) { assertTablesAreEqual(expected, found); } } @@ -5563,8 +6071,8 @@ void testGroupByMergeLists() { Arrays.asList(new StructData(333, "s333"), new StructData(222, "s222"), new StructData(111, "s111")), Arrays.asList(new StructData(222, "s222"), new StructData(444, "s444"))) .build(); - Table retListOfInts = input.groupBy(0).aggregate(Aggregation.mergeLists().onColumn(1)); - Table retListOfStructs = input.groupBy(0).aggregate(Aggregation.mergeLists().onColumn(2))) { + Table retListOfInts = input.groupBy(0).aggregate(GroupByAggregation.mergeLists().onColumn(1)); + Table retListOfStructs = input.groupBy(0).aggregate(GroupByAggregation.mergeLists().onColumn(2))) { assertTablesAreEqual(expectedListOfInts, retListOfInts); assertTablesAreEqual(expectedListOfStructs, retListOfStructs); } @@ -5573,7 +6081,7 @@ void testGroupByMergeLists() { @Test void testGroupByCollectSetIncludeNulls() { // test with null unequal and nan unequal - Aggregation collectSet = Aggregation.collectSet(NullPolicy.INCLUDE, + GroupByAggregation collectSet = GroupByAggregation.collectSet(NullPolicy.INCLUDE, NullEquality.UNEQUAL, NaNEquality.UNEQUAL); try (Table input = new Table.TestBuilder() .column(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4) @@ -5589,7 +6097,7 @@ void testGroupByCollectSetIncludeNulls() { assertTablesAreEqual(expected, 
found); } // test with null equal and nan unequal - collectSet = Aggregation.collectSet(NullPolicy.INCLUDE, + collectSet = GroupByAggregation.collectSet(NullPolicy.INCLUDE, NullEquality.EQUAL, NaNEquality.UNEQUAL); try (Table input = new Table.TestBuilder() .column(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4) @@ -5610,7 +6118,7 @@ void testGroupByCollectSetIncludeNulls() { assertTablesAreEqual(expected, found); } // test with null equal and nan equal - collectSet = Aggregation.collectSet(NullPolicy.INCLUDE, + collectSet = GroupByAggregation.collectSet(NullPolicy.INCLUDE, NullEquality.EQUAL, NaNEquality.ALL_EQUAL); try (Table input = new Table.TestBuilder() .column(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4) @@ -5671,10 +6179,10 @@ void testGroupByMergeSets() { Arrays.asList(1e-3, 1e3, Double.NaN), Arrays.asList()) .build(); - Table retListOfInts = input.groupBy(0).aggregate(Aggregation.mergeSets().onColumn(1)); - Table retListOfDoubles = input.groupBy(0).aggregate(Aggregation.mergeSets().onColumn(2)); + Table retListOfInts = input.groupBy(0).aggregate(GroupByAggregation.mergeSets().onColumn(1)); + Table retListOfDoubles = input.groupBy(0).aggregate(GroupByAggregation.mergeSets().onColumn(2)); Table retListOfDoublesNaNEq = input.groupBy(0).aggregate( - Aggregation.mergeSets(NullEquality.UNEQUAL, NaNEquality.ALL_EQUAL).onColumn(2))) { + GroupByAggregation.mergeSets(NullEquality.UNEQUAL, NaNEquality.ALL_EQUAL).onColumn(2))) { assertTablesAreEqual(expectedListOfInts, retListOfInts); assertTablesAreEqual(expectedListOfDoubles, retListOfDoubles); assertTablesAreEqual(expectedListOfDoublesNaNEq, retListOfDoublesNaNEq); @@ -5853,6 +6361,121 @@ void testAllFilteredFromValidity() { } } + ColumnView replaceValidity(ColumnView cv, DeviceMemoryBuffer validity, long nullCount) { + assert (validity.length >= BitVectorHelper.getValidityAllocationSizeInBytes(cv.rows)); + if (cv.type.isNestedType()) { + ColumnView[] children = cv.getChildColumnViews(); + try { + return new ColumnView(cv.type, + cv.rows, + Optional.of(nullCount), + validity, + cv.getOffsets(), + children); + } finally { + for (ColumnView v : children) { + if (v != null) { + v.close(); + } + } + } + } else { + return new ColumnView(cv.type, cv.rows, Optional.of(nullCount), cv.getData(), validity, cv.getOffsets()); + } + } + + @Test + void testRemoveNullMasksIfNeeded() { + ListType nestedType = new ListType(true, new StructType(false, + new BasicType(true, DType.INT32), + new BasicType(true, DType.INT64))); + + List data1 = Arrays.asList(10, 20L); + List data2 = Arrays.asList(50, 60L); + HostColumnVector.StructData structData1 = new HostColumnVector.StructData(data1); + HostColumnVector.StructData structData2 = new HostColumnVector.StructData(data2); + + //First we create ColumnVectors + try (ColumnVector nonNullVector0 = ColumnVector.fromBoxedInts(1, 2, 3); + ColumnVector nonNullVector2 = ColumnVector.fromStrings("1", "2", "3"); + ColumnVector nonNullVector1 = ColumnVector.fromLists(nestedType, + Arrays.asList(structData1, structData2), + Arrays.asList(structData1, structData2), + Arrays.asList(structData1, structData2))) { + //Then we take the created ColumnVectors and add validity masks even though the nullCount = 0 + long allocSize = BitVectorHelper.getValidityAllocationSizeInBytes(nonNullVector0.rows); + try (DeviceMemoryBuffer dm0 = DeviceMemoryBuffer.allocate(allocSize); + DeviceMemoryBuffer dm1 = DeviceMemoryBuffer.allocate(allocSize); + DeviceMemoryBuffer dm2 = DeviceMemoryBuffer.allocate(allocSize); + DeviceMemoryBuffer 
dm3_child = + DeviceMemoryBuffer.allocate(BitVectorHelper.getValidityAllocationSizeInBytes(2))) { + Cuda.memset(dm0.address, (byte) 0xFF, allocSize); + Cuda.memset(dm1.address, (byte) 0xFF, allocSize); + Cuda.memset(dm2.address, (byte) 0xFF, allocSize); + Cuda.memset(dm3_child.address, (byte) 0xFF, + BitVectorHelper.getValidityAllocationSizeInBytes(2)); + + try (ColumnView cv0View = replaceValidity(nonNullVector0, dm0, 0); + ColumnVector cv0 = cv0View.copyToColumnVector(); + ColumnView struct = nonNullVector1.getChildColumnView(0); + ColumnView structChild0 = struct.getChildColumnView(0); + ColumnView newStructChild0 = replaceValidity(structChild0, dm3_child, 0); + ColumnView newStruct = struct.replaceChildrenWithViews(new int[]{0}, new ColumnView[]{newStructChild0}); + ColumnView list = nonNullVector1.replaceChildrenWithViews(new int[]{0}, new ColumnView[]{newStruct}); + ColumnView cv1View = replaceValidity(list, dm1, 0); + ColumnVector cv1 = cv1View.copyToColumnVector(); + ColumnView cv2View = replaceValidity(nonNullVector2, dm2, 0); + ColumnVector cv2 = cv2View.copyToColumnVector()) { + + try (Table t = new Table(new ColumnVector[]{cv0, cv1, cv2}); + Table tableWithoutNullMask = removeNullMasksIfNeeded(t); + ColumnView tableStructChild0 = t.getColumn(1).getChildColumnView(0).getChildColumnView(0); + ColumnVector tableStructChild0Cv = tableStructChild0.copyToColumnVector(); + Table expected = new Table(new ColumnVector[]{nonNullVector0, nonNullVector1, + nonNullVector2})) { + assertTrue(t.getColumn(0).hasValidityVector()); + assertTrue(t.getColumn(1).hasValidityVector()); + assertTrue(t.getColumn(2).hasValidityVector()); + assertTrue(tableStructChild0Cv.hasValidityVector()); + + assertPartialTablesAreEqual(expected, + 0, + expected.getRowCount(), + tableWithoutNullMask, + true, + true); + } + } + } + } + } + + @Test + void testRemoveNullMasksIfNeededWithNulls() { + ListType nestedType = new ListType(true, new StructType(true, + new BasicType(true, DType.INT32), + new BasicType(true, DType.INT64))); + + List data1 = Arrays.asList(0, 10L); + List data2 = Arrays.asList(50, null); + HostColumnVector.StructData structData1 = new HostColumnVector.StructData(data1); + HostColumnVector.StructData structData2 = new HostColumnVector.StructData(data2); + + //First we create ColumnVectors + try (ColumnVector nonNullVector0 = ColumnVector.fromBoxedInts(1, null, 2, 3); + ColumnVector nonNullVector1 = ColumnVector.fromStrings("1", "2", null, "3"); + ColumnVector nonNullVector2 = ColumnVector.fromLists(nestedType, + Arrays.asList(structData1, structData2), + null, + Arrays.asList(structData1, structData2), + Arrays.asList(structData1, structData2))) { + try (Table expected = new Table(new ColumnVector[]{nonNullVector0, nonNullVector1, nonNullVector2}); + Table unchangedTable = removeNullMasksIfNeeded(expected)) { + assertTablesAreEqual(expected, unchangedTable); + } + } + } + @Test void testMismatchedSizesForFilter() { Boolean[] maskVals = new Boolean[3]; @@ -6002,6 +6625,40 @@ void testParquetWriteToBufferChunkedInt96() { } } + @Test + void testParquetWriteMap() throws IOException { + ParquetWriterOptions options = ParquetWriterOptions.builder() + .withMapColumn(mapColumn("my_map", + new ParquetColumnWriterOptions("key0", false), + new ParquetColumnWriterOptions("value0"))).build(); + File f = File.createTempFile("test-map", ".parquet"); + List list1 = + Arrays.asList(new HostColumnVector.StructData(Arrays.asList("a", "b"))); + List list2 = + Arrays.asList(new 
HostColumnVector.StructData(Arrays.asList("a", "c"))); + List list3 = + Arrays.asList(new HostColumnVector.StructData(Arrays.asList("e", "d"))); + HostColumnVector.StructType structType = new HostColumnVector.StructType(true, + Arrays.asList(new HostColumnVector.BasicType(true, DType.STRING), + new HostColumnVector.BasicType(true, DType.STRING))); + try (Table t0 = new Table(ColumnVector.fromLists(new HostColumnVector.ListType(true, + structType), list1, list2, list3))) { + try (TableWriter writer = Table.writeParquetChunked(options, f)) { + writer.write(t0); + } + ParquetFileReader reader = + ParquetFileReader.open(HadoopInputFile.fromPath(new Path(f.getAbsolutePath()), + new Configuration())); + MessageType schema = reader.getFooter().getFileMetaData().getSchema(); + assertEquals(OriginalType.MAP, schema.getType("my_map").getOriginalType()); + } + try (ColumnVector cv = Table.readParquet(f).getColumn(0); + ColumnVector res = cv.getMapValue(Scalar.fromString("a")); + ColumnVector expected = ColumnVector.fromStrings("b", "c", null)) { + assertColumnsAreEqual(expected, res); + } + } + @Test void testParquetWriteToBufferChunkedWithNested() { ParquetWriterOptions options = ParquetWriterOptions.builder() diff --git a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java index 5a64fd6ab09..13af9aff682 100644 --- a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java +++ b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java @@ -42,16 +42,14 @@ public class CompiledExpressionTest extends CudfTestBase { public void testColumnReferenceTransform() { try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build()) { // use an implicit table reference - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, - new ColumnReference(1)); + ColumnReference expr = new ColumnReference(1); try (CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t)) { assertColumnsAreEqual(t.getColumn(1), actual); } // use an explicit table reference - expr = new UnaryExpression(UnaryOperator.IDENTITY, - new ColumnReference(1, TableReference.LEFT)); + expr = new ColumnReference(1, TableReference.LEFT); try (CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t)) { assertColumnsAreEqual(t.getColumn(1), actual); @@ -61,22 +59,19 @@ public void testColumnReferenceTransform() { @Test public void testInvalidColumnReferenceTransform() { - // verify attempting to reference an invalid table remaps to the only valid table - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, - new ColumnReference(1, TableReference.RIGHT)); + // Verify that computeColumn throws when passed an expression operating on TableReference.RIGHT. 
+ ColumnReference expr = new ColumnReference(1, TableReference.RIGHT); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); - CompiledExpression compiledExpr = expr.compile(); - ColumnVector actual = compiledExpr.computeColumn(t)) { - assertColumnsAreEqual(t.getColumn(1), actual); + CompiledExpression compiledExpr = expr.compile()) { + Assertions.assertThrows(CudfException.class, () -> compiledExpr.computeColumn(t).close()); } } @Test public void testBooleanLiteralTransform() { try (Table t = new Table.TestBuilder().column(true, false, null).build()) { - Literal trueLiteral = Literal.ofBoolean(true); - UnaryExpression trueExpr = new UnaryExpression(UnaryOperator.IDENTITY, trueLiteral); - try (CompiledExpression trueCompiledExpr = trueExpr.compile(); + Literal expr = Literal.ofBoolean(true); + try (CompiledExpression trueCompiledExpr = expr.compile(); ColumnVector trueExprActual = trueCompiledExpr.computeColumn(t); ColumnVector trueExprExpected = ColumnVector.fromBoxedBooleans(true, true, true)) { assertColumnsAreEqual(trueExprExpected, trueExprActual); @@ -84,7 +79,7 @@ public void testBooleanLiteralTransform() { // Uncomment the following after https://github.com/rapidsai/cudf/issues/8831 is fixed // Literal nullLiteral = Literal.ofBoolean(null); - // UnaryExpression nullExpr = new UnaryExpression(AstOperator.IDENTITY, nullLiteral); + // UnaryOperation nullExpr = new UnaryOperation(AstOperator.IDENTITY, nullLiteral); // try (CompiledExpression nullCompiledExpr = nullExpr.compile(); // ColumnVector nullExprActual = nullCompiledExpr.computeColumn(t); // ColumnVector nullExprExpected = ColumnVector.fromBoxedBooleans(null, null, null)) { @@ -98,8 +93,7 @@ public void testBooleanLiteralTransform() { // @NullSource @ValueSource(bytes = 0x12) public void testByteLiteralTransform(Byte value) { - Literal literal = Literal.ofByte(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofByte(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -113,8 +107,7 @@ public void testByteLiteralTransform(Byte value) { // @NullSource @ValueSource(shorts = 0x1234) public void testShortLiteralTransform(Short value) { - Literal literal = Literal.ofShort(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofShort(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -128,8 +121,7 @@ public void testShortLiteralTransform(Short value) { // @NullSource @ValueSource(ints = 0x12345678) public void testIntLiteralTransform(Integer value) { - Literal literal = Literal.ofInt(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofInt(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -143,8 +135,7 @@ public void testIntLiteralTransform(Integer value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testLongLiteralTransform(Long value) { - Literal literal = Literal.ofLong(value); - UnaryExpression expr = new 
UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofLong(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -158,8 +149,7 @@ public void testLongLiteralTransform(Long value) { // @NullSource @ValueSource(floats = { 123456.789f, Float.NaN, Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY} ) public void testFloatLiteralTransform(Float value) { - Literal literal = Literal.ofFloat(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofFloat(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -173,8 +163,7 @@ public void testFloatLiteralTransform(Float value) { // @NullSource @ValueSource(doubles = { 123456.789f, Double.NaN, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY} ) public void testDoubleLiteralTransform(Double value) { - Literal literal = Literal.ofDouble(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofDouble(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -188,8 +177,7 @@ public void testDoubleLiteralTransform(Double value) { // @NullSource @ValueSource(ints = 0x12345678) public void testTimestampDaysLiteralTransform(Integer value) { - Literal literal = Literal.ofTimestampDaysFromInt(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofTimestampDaysFromInt(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -204,8 +192,7 @@ public void testTimestampDaysLiteralTransform(Integer value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testTimestampSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofTimestampFromLong(DType.TIMESTAMP_SECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofTimestampFromLong(DType.TIMESTAMP_SECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -220,8 +207,7 @@ public void testTimestampSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testTimestampMilliSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofTimestampFromLong(DType.TIMESTAMP_MILLISECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofTimestampFromLong(DType.TIMESTAMP_MILLISECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -236,8 +222,7 @@ public void testTimestampMilliSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void 
testTimestampMicroSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofTimestampFromLong(DType.TIMESTAMP_MICROSECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofTimestampFromLong(DType.TIMESTAMP_MICROSECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -252,8 +237,7 @@ public void testTimestampMicroSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testTimestampNanoSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofTimestampFromLong(DType.TIMESTAMP_NANOSECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofTimestampFromLong(DType.TIMESTAMP_NANOSECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -268,8 +252,7 @@ public void testTimestampNanoSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(ints = 0x12345678) public void testDurationDaysLiteralTransform(Integer value) { - Literal literal = Literal.ofDurationDaysFromInt(value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofDurationDaysFromInt(value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -284,8 +267,7 @@ public void testDurationDaysLiteralTransform(Integer value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testDurationSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofDurationFromLong(DType.DURATION_SECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofDurationFromLong(DType.DURATION_SECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -300,8 +282,7 @@ public void testDurationSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testDurationMilliSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofDurationFromLong(DType.DURATION_MILLISECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofDurationFromLong(DType.DURATION_MILLISECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -316,8 +297,7 @@ public void testDurationMilliSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testDurationMicroSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofDurationFromLong(DType.DURATION_MICROSECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofDurationFromLong(DType.DURATION_MICROSECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 
1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -332,8 +312,7 @@ public void testDurationMicroSecondsLiteralTransform(Long value) { // @NullSource @ValueSource(longs = 0x1234567890abcdefL) public void testDurationNanoSecondsLiteralTransform(Long value) { - Literal literal = Literal.ofDurationFromLong(DType.DURATION_NANOSECONDS, value); - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, literal); + Literal expr = Literal.ofDurationFromLong(DType.DURATION_NANOSECONDS, value); try (Table t = new Table.TestBuilder().column(5, 4, 3, 2, 1).column(6, 7, 8, null, 10).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -360,7 +339,7 @@ private static ArrayList mapArray(T[] in1, U[] in2, BiFunction createUnaryDoubleExpressionParams() { + private static Stream createUnaryDoubleOperationParams() { Double[] input = new Double[] { -5., 4.5, null, 2.7, 1.5 }; return Stream.of( Arguments.of(UnaryOperator.IDENTITY, input, Arrays.asList(input)), @@ -384,10 +363,10 @@ private static Stream createUnaryDoubleExpressionParams() { } @ParameterizedTest - @MethodSource("createUnaryDoubleExpressionParams") - void testUnaryDoubleExpressionTransform(UnaryOperator op, Double[] input, + @MethodSource("createUnaryDoubleOperationParams") + void testUnaryDoubleOperationTransform(UnaryOperator op, Double[] input, List expectedValues) { - UnaryExpression expr = new UnaryExpression(op, new ColumnReference(0)); + UnaryOperation expr = new UnaryOperation(op, new ColumnReference(0)); try (Table t = new Table.TestBuilder().column(input).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -398,17 +377,17 @@ void testUnaryDoubleExpressionTransform(UnaryOperator op, Double[] input, } @Test - void testUnaryShortExpressionTransform() { + void testUnaryShortOperationTransform() { Short[] input = new Short[] { -5, 4, null, 2, 1 }; try (Table t = new Table.TestBuilder().column(input).build()) { - UnaryExpression expr = new UnaryExpression(UnaryOperator.IDENTITY, new ColumnReference(0)); + ColumnReference expr = new ColumnReference(0); try (CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t)) { assertColumnsAreEqual(t.getColumn(0), actual); } - expr = new UnaryExpression(UnaryOperator.BIT_INVERT, new ColumnReference(0)); - try (CompiledExpression compiledExpr = expr.compile(); + UnaryOperation expr2 = new UnaryOperation(UnaryOperator.BIT_INVERT, new ColumnReference(0)); + try (CompiledExpression compiledExpr = expr2.compile(); ColumnVector actual = compiledExpr.computeColumn(t); ColumnVector expected = ColumnVector.fromBoxedInts(4, -5, null, -3, -2)) { assertColumnsAreEqual(expected, actual); @@ -417,8 +396,8 @@ void testUnaryShortExpressionTransform() { } @Test - void testUnaryLogicalExpressionTransform() { - UnaryExpression expr = new UnaryExpression(UnaryOperator.NOT, new ColumnReference(0)); + void testUnaryLogicalOperationTransform() { + UnaryOperation expr = new UnaryOperation(UnaryOperator.NOT, new ColumnReference(0)); try (Table t = new Table.TestBuilder().column(-5L, 0L, null, 2L, 1L).build(); CompiledExpression compiledExpr = expr.compile(); ColumnVector actual = compiledExpr.computeColumn(t); @@ -427,7 +406,7 @@ void testUnaryLogicalExpressionTransform() { } } - private static Stream createBinaryFloatExpressionParams() { + private static 
Stream createBinaryFloatOperationParams() { Float[] in1 = new Float[] { -5f, 4.5f, null, 2.7f }; Float[] in2 = new Float[] { 123f, -456f, null, 0f }; return Stream.of( @@ -443,10 +422,10 @@ private static Stream createBinaryFloatExpressionParams() { } @ParameterizedTest - @MethodSource("createBinaryFloatExpressionParams") - void testBinaryFloatExpressionTransform(BinaryOperator op, Float[] in1, Float[] in2, + @MethodSource("createBinaryFloatOperationParams") + void testBinaryFloatOperationTransform(BinaryOperator op, Float[] in1, Float[] in2, List expectedValues) { - BinaryExpression expr = new BinaryExpression(op, + BinaryOperation expr = new BinaryOperation(op, new ColumnReference(0), new ColumnReference(1)); try (Table t = new Table.TestBuilder().column(in1).column(in2).build(); @@ -458,7 +437,7 @@ void testBinaryFloatExpressionTransform(BinaryOperator op, Float[] in1, Float[] } } - private static Stream createBinaryDoublePromotedExpressionParams() { + private static Stream createBinaryDoublePromotedOperationParams() { Float[] in1 = new Float[] { -5f, 4.5f, null, 2.7f }; Float[] in2 = new Float[] { 123f, -456f, null, 0f }; return Stream.of( @@ -469,10 +448,10 @@ private static Stream createBinaryDoublePromotedExpressionParams() { } @ParameterizedTest - @MethodSource("createBinaryDoublePromotedExpressionParams") - void testBinaryDoublePromotedExpressionTransform(BinaryOperator op, Float[] in1, Float[] in2, + @MethodSource("createBinaryDoublePromotedOperationParams") + void testBinaryDoublePromotedOperationTransform(BinaryOperator op, Float[] in1, Float[] in2, List expectedValues) { - BinaryExpression expr = new BinaryExpression(op, + BinaryOperation expr = new BinaryOperation(op, new ColumnReference(0), new ColumnReference(1)); try (Table t = new Table.TestBuilder().column(in1).column(in2).build(); @@ -484,7 +463,7 @@ void testBinaryDoublePromotedExpressionTransform(BinaryOperator op, Float[] in1, } } - private static Stream createBinaryComparisonExpressionParams() { + private static Stream createBinaryComparisonOperationParams() { Integer[] in1 = new Integer[] { -5, 4, null, 2, -3 }; Integer[] in2 = new Integer[] { 123, -456, null, 0, -3 }; return Stream.of( @@ -498,10 +477,10 @@ private static Stream createBinaryComparisonExpressionParams() { } @ParameterizedTest - @MethodSource("createBinaryComparisonExpressionParams") - void testBinaryComparisonExpressionTransform(BinaryOperator op, Integer[] in1, Integer[] in2, + @MethodSource("createBinaryComparisonOperationParams") + void testBinaryComparisonOperationTransform(BinaryOperator op, Integer[] in1, Integer[] in2, List expectedValues) { - BinaryExpression expr = new BinaryExpression(op, + BinaryOperation expr = new BinaryOperation(op, new ColumnReference(0), new ColumnReference(1)); try (Table t = new Table.TestBuilder().column(in1).column(in2).build(); @@ -513,7 +492,7 @@ void testBinaryComparisonExpressionTransform(BinaryOperator op, Integer[] in1, I } } - private static Stream createBinaryBitwiseExpressionParams() { + private static Stream createBinaryBitwiseOperationParams() { Integer[] in1 = new Integer[] { -5, 4, null, 2, -3 }; Integer[] in2 = new Integer[] { 123, -456, null, 0, -3 }; return Stream.of( @@ -523,10 +502,10 @@ private static Stream createBinaryBitwiseExpressionParams() { } @ParameterizedTest - @MethodSource("createBinaryBitwiseExpressionParams") - void testBinaryBitwiseExpressionTransform(BinaryOperator op, Integer[] in1, Integer[] in2, + @MethodSource("createBinaryBitwiseOperationParams") + void 
testBinaryBitwiseOperationTransform(BinaryOperator op, Integer[] in1, Integer[] in2, List expectedValues) { - BinaryExpression expr = new BinaryExpression(op, + BinaryOperation expr = new BinaryOperation(op, new ColumnReference(0), new ColumnReference(1)); try (Table t = new Table.TestBuilder().column(in1).column(in2).build(); @@ -538,7 +517,7 @@ void testBinaryBitwiseExpressionTransform(BinaryOperator op, Integer[] in1, Inte } } - private static Stream createBinaryBooleanExpressionParams() { + private static Stream createBinaryBooleanOperationParams() { Boolean[] in1 = new Boolean[] { false, true, null, true, false }; Boolean[] in2 = new Boolean[] { true, null, null, true, false }; return Stream.of( @@ -547,10 +526,10 @@ private static Stream createBinaryBooleanExpressionParams() { } @ParameterizedTest - @MethodSource("createBinaryBooleanExpressionParams") - void testBinaryBooleanExpressionTransform(BinaryOperator op, Boolean[] in1, Boolean[] in2, + @MethodSource("createBinaryBooleanOperationParams") + void testBinaryBooleanOperationTransform(BinaryOperator op, Boolean[] in1, Boolean[] in2, List expectedValues) { - BinaryExpression expr = new BinaryExpression(op, + BinaryOperation expr = new BinaryOperation(op, new ColumnReference(0), new ColumnReference(1)); try (Table t = new Table.TestBuilder().column(in1).column(in2).build(); @@ -563,9 +542,9 @@ void testBinaryBooleanExpressionTransform(BinaryOperator op, Boolean[] in1, Bool } @Test - void testMismatchedBinaryExpressionTypes() { + void testMismatchedBinaryOperationTypes() { // verify expression fails to transform if operands are not the same type - BinaryExpression expr = new BinaryExpression(BinaryOperator.ADD, + BinaryOperation expr = new BinaryOperation(BinaryOperator.ADD, new ColumnReference(0), new ColumnReference(1)); try (Table t = new Table.TestBuilder().column(1, 2, 3).column(1L, 2L, 3L).build(); diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 2d52b517242..6b5e5b858f0 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -8,6 +8,7 @@ import rmm +from cudf.api.types import dtype from cudf import core, datasets, testing from cudf._version import get_versions from cudf.api.extensions import ( @@ -15,34 +16,36 @@ register_index_accessor, register_series_accessor, ) -from cudf.core import ( +from cudf.core.scalar import ( NA, + Scalar, +) +from cudf.core.index import ( BaseIndex, CategoricalIndex, - DataFrame, DatetimeIndex, Float32Index, Float64Index, Index, + GenericIndex, Int8Index, Int16Index, Int32Index, Int64Index, IntervalIndex, - MultiIndex, RangeIndex, - Scalar, - Series, + StringIndex, TimedeltaIndex, UInt8Index, UInt16Index, UInt32Index, UInt64Index, - cut, - from_pandas, interval_range, - merge, ) +from cudf.core.dataframe import DataFrame, from_pandas, merge +from cudf.core.series import Series +from cudf.core.multiindex import MultiIndex +from cudf.core.cut import cut from cudf.core.algorithms import factorize from cudf.core.dtypes import ( CategoricalDtype, @@ -73,7 +76,14 @@ tan, true_divide, ) -from cudf.core.reshape import concat, get_dummies, melt, merge_sorted +from cudf.core.reshape import ( + concat, + get_dummies, + melt, + merge_sorted, + pivot, + unstack, +) from cudf.core.series import isclose from cudf.core.tools.datetimes import DateOffset, to_datetime from cudf.core.tools.numeric import to_numeric diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index fe9ed4d4934..83ab02351f2 100644 --- 
a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -18,44 +18,44 @@ ALL_POSSIBLE_VALUES = "ALL_POSSIBLE_VALUES" _PANDAS_TO_AVRO_SCHEMA_MAP = { - np.dtype("int8"): "int", + cudf.dtype("int8"): "int", pd.Int8Dtype(): ["int", "null"], pd.Int16Dtype(): ["int", "null"], pd.Int32Dtype(): ["int", "null"], pd.Int64Dtype(): ["long", "null"], pd.BooleanDtype(): ["boolean", "null"], pd.StringDtype(): ["string", "null"], - np.dtype("bool_"): "boolean", - np.dtype("int16"): "int", - np.dtype("int32"): "int", - np.dtype("int64"): "long", - np.dtype("O"): "string", - np.dtype("str"): "string", - np.dtype("float32"): "float", - np.dtype("float64"): "double", - np.dtype("( + NullHandling.EXCLUDE + ) + ) + ) + return agg + + @classmethod + def last(cls): + cdef Aggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_nth_element_aggregation[aggregation]( + -1, + ( + NullHandling.EXCLUDE + ) + ) + ) + return agg + @classmethod def any(cls): cdef Aggregation agg = cls() @@ -251,7 +279,7 @@ cdef class Aggregation: nb_type = numpy_support.from_dtype(kwargs['dtype']) type_signature = (nb_type[:],) compiled_op = cudautils.compile_udf(op, type_signature) - output_np_dtype = np.dtype(compiled_op[1]) + output_np_dtype = cudf.dtype(compiled_op[1]) cpp_str = compiled_op[0].encode('UTF-8') if output_np_dtype not in np_to_cudf_types: raise TypeError( @@ -395,7 +423,7 @@ cdef class RollingAggregation: nb_type = numpy_support.from_dtype(kwargs['dtype']) type_signature = (nb_type[:],) compiled_op = cudautils.compile_udf(op, type_signature) - output_np_dtype = np.dtype(compiled_op[1]) + output_np_dtype = cudf.dtype(compiled_op[1]) cpp_str = compiled_op[0].encode('UTF-8') if output_np_dtype not in np_to_cudf_types: raise TypeError( @@ -433,6 +461,299 @@ cdef class RollingAggregation: )) return agg +cdef class GroupbyAggregation: + """A Cython wrapper for groupby aggregations. + + **This class should never be instantiated using a standard constructor, + only using one of its many factories.** These factories handle mapping + different cudf operations to their libcudf analogs, e.g. + `cudf.DataFrame.idxmin` -> `libcudf.argmin`. Additionally, they perform + any additional configuration needed to translate Python arguments into + their corresponding C++ types (for instance, C++ enumerations used for + flag arguments). The factory approach is necessary to support operations + like `df.agg(lambda x: x.sum())`; such functions are called with this + class as an argument to generation the desired aggregation. 
+ """ + @property + def kind(self): + return AggregationKind(self.c_obj.get()[0].kind).name + + @classmethod + def sum(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_sum_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def min(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_min_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def max(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_max_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def idxmin(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_argmin_aggregation[ + groupby_aggregation]()) + return agg + + @classmethod + def idxmax(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_argmax_aggregation[ + groupby_aggregation]()) + return agg + + @classmethod + def mean(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_mean_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def count(cls, dropna=True): + cdef libcudf_types.null_policy c_null_handling + if dropna: + c_null_handling = libcudf_types.null_policy.EXCLUDE + else: + c_null_handling = libcudf_types.null_policy.INCLUDE + + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_count_aggregation[groupby_aggregation]( + c_null_handling + )) + return agg + + @classmethod + def size(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation.make_count_aggregation[groupby_aggregation]( + ( + NullHandling.INCLUDE) + )) + return agg + + @classmethod + def collect(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_collect_list_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def nunique(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_nunique_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def nth(cls, libcudf_types.size_type size): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_nth_element_aggregation[groupby_aggregation](size)) + return agg + + @classmethod + def product(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_product_aggregation[groupby_aggregation]()) + return agg + prod = product + + @classmethod + def sum_of_squares(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_sum_of_squares_aggregation[groupby_aggregation]() + ) + return agg + + @classmethod + def var(cls, ddof=1): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_variance_aggregation[groupby_aggregation](ddof)) + return agg + + @classmethod + def std(cls, ddof=1): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_std_aggregation[groupby_aggregation](ddof)) + return agg + + @classmethod + def median(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. 
+ make_median_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def quantile(cls, q=0.5, interpolation="linear"): + cdef GroupbyAggregation agg = cls() + + if not pd.api.types.is_list_like(q): + q = [q] + + cdef vector[double] c_q = q + cdef libcudf_types.interpolation c_interp = ( + ( + ( + Interpolation[interpolation.upper()] + ) + ) + ) + agg.c_obj = move( + libcudf_aggregation.make_quantile_aggregation[groupby_aggregation]( + c_q, c_interp) + ) + return agg + + @classmethod + def unique(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_collect_set_aggregation[groupby_aggregation]()) + return agg + + @classmethod + def first(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_nth_element_aggregation[groupby_aggregation]( + 0, + ( + NullHandling.EXCLUDE + ) + ) + ) + return agg + + @classmethod + def last(cls): + cdef GroupbyAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_nth_element_aggregation[groupby_aggregation]( + -1, + ( + NullHandling.EXCLUDE + ) + ) + ) + return agg + +cdef class GroupbyScanAggregation: + """A Cython wrapper for groupby scan aggregations. + + **This class should never be instantiated using a standard constructor, + only using one of its many factories.** These factories handle mapping + different cudf operations to their libcudf analogs, e.g. + `cudf.DataFrame.idxmin` -> `libcudf.argmin`. Additionally, they perform + any additional configuration needed to translate Python arguments into + their corresponding C++ types (for instance, C++ enumerations used for + flag arguments). The factory approach is necessary to support operations + like `df.agg(lambda x: x.sum())`; such functions are called with this + class as an argument to generation the desired aggregation. + """ + @property + def kind(self): + return AggregationKind(self.c_obj.get()[0].kind).name + + @classmethod + def sum(cls): + cdef GroupbyScanAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_sum_aggregation[groupby_scan_aggregation]()) + return agg + + @classmethod + def min(cls): + cdef GroupbyScanAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_min_aggregation[groupby_scan_aggregation]()) + return agg + + @classmethod + def max(cls): + cdef GroupbyScanAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_max_aggregation[groupby_scan_aggregation]()) + return agg + + @classmethod + def count(cls, dropna=True): + cdef libcudf_types.null_policy c_null_handling + if dropna: + c_null_handling = libcudf_types.null_policy.EXCLUDE + else: + c_null_handling = libcudf_types.null_policy.INCLUDE + + cdef GroupbyScanAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_count_aggregation[groupby_scan_aggregation](c_null_handling)) + return agg + + @classmethod + def size(cls): + cdef GroupbyScanAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. + make_count_aggregation[groupby_scan_aggregation]( + ( + NullHandling.INCLUDE) + )) + return agg + + @classmethod + def cumcount(cls): + cdef GroupbyScanAggregation agg = cls() + agg.c_obj = move( + libcudf_aggregation. 
+ make_count_aggregation[groupby_scan_aggregation]( + libcudf_types.null_policy.INCLUDE + )) + return agg + + # scan aggregations + # TODO: update this after adding per algorithm aggregation derived types + # https://github.com/rapidsai/cudf/issues/7106 + cumsum = sum + cummin = min + cummax = max + + cdef Aggregation make_aggregation(op, kwargs=None): r""" Parameters @@ -508,3 +829,79 @@ cdef RollingAggregation make_rolling_aggregation(op, kwargs=None): else: raise TypeError(f"Unknown aggregation {op}") return agg + +cdef GroupbyAggregation make_groupby_aggregation(op, kwargs=None): + r""" + Parameters + ---------- + op : str or callable + If callable, must meet one of the following requirements: + + * Is of the form lambda x: x.agg(*args, **kwargs), where + `agg` is the name of a supported aggregation. Used to + to specify aggregations that take arguments, e.g., + `lambda x: x.quantile(0.5)`. + * Is a user defined aggregation function that operates on + group values. In this case, the output dtype must be + specified in the `kwargs` dictionary. + \*\*kwargs : dict, optional + Any keyword arguments to be passed to the op. + + Returns + ------- + GroupbyAggregation + """ + if kwargs is None: + kwargs = {} + + cdef GroupbyAggregation agg + if isinstance(op, str): + agg = getattr(GroupbyAggregation, op)(**kwargs) + elif callable(op): + if op is list: + agg = GroupbyAggregation.collect() + elif "dtype" in kwargs: + agg = GroupbyAggregation.from_udf(op, **kwargs) + else: + agg = op(GroupbyAggregation) + else: + raise TypeError(f"Unknown aggregation {op}") + return agg + +cdef GroupbyScanAggregation make_groupby_scan_aggregation(op, kwargs=None): + r""" + Parameters + ---------- + op : str or callable + If callable, must meet one of the following requirements: + + * Is of the form lambda x: x.agg(*args, **kwargs), where + `agg` is the name of a supported aggregation. Used to + to specify aggregations that take arguments, e.g., + `lambda x: x.quantile(0.5)`. + * Is a user defined aggregation function that operates on + group values. In this case, the output dtype must be + specified in the `kwargs` dictionary. + \*\*kwargs : dict, optional + Any keyword arguments to be passed to the op. 
+ + Returns + ------- + GroupbyScanAggregation + """ + if kwargs is None: + kwargs = {} + + cdef GroupbyScanAggregation agg + if isinstance(op, str): + agg = getattr(GroupbyScanAggregation, op)(**kwargs) + elif callable(op): + if op is list: + agg = GroupbyScanAggregation.collect() + elif "dtype" in kwargs: + agg = GroupbyScanAggregation.from_udf(op, **kwargs) + else: + agg = op(GroupbyScanAggregation) + else: + raise TypeError(f"Unknown aggregation {op}") + return agg diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx index 52ddbd8b8fb..5b644fda2f8 100644 --- a/python/cudf/cudf/_lib/avro.pyx +++ b/python/cudf/cudf/_lib/avro.pyx @@ -12,6 +12,7 @@ from cudf._lib.cpp.io.types cimport table_with_metadata from cudf._lib.cpp.types cimport size_type from cudf._lib.io.utils cimport make_source_info from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): @@ -52,4 +53,4 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): names = [name.decode() for name in c_result.metadata.column_names] - return Table.from_unique_ptr(move(c_result.tbl), column_names=names) + return data_from_unique_ptr(move(c_result.tbl), column_names=names) diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx index e8305ecaf2d..7e0be09236f 100644 --- a/python/cudf/cudf/_lib/binaryop.pyx +++ b/python/cudf/cudf/_lib/binaryop.pyx @@ -28,6 +28,7 @@ from cudf.utils.dtypes import is_scalar, is_string_dtype cimport cudf._lib.cpp.binaryop as cpp_binaryop from cudf._lib.cpp.binaryop cimport binary_operator +import cudf class BinaryOperation(IntEnum): @@ -211,7 +212,7 @@ def binaryop_udf(Column lhs, Column rhs, udf_ptx, dtype): cdef type_id tid = ( ( ( - np_to_cudf_types[np.dtype(dtype)] + np_to_cudf_types[cudf.dtype(dtype)] ) ) ) diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index 86778e0a9e1..5266d0ac773 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -15,6 +15,7 @@ from cudf._lib.cpp.concatenate cimport ( from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.table cimport Table from cudf._lib.utils cimport ( + data_from_unique_ptr, make_column_views, make_table_data_views, make_table_views, @@ -52,7 +53,8 @@ cpdef concat_tables(object tables, bool ignore_index=False): c_views = make_table_data_views(tables) with nogil: c_result = move(libcudf_concatenate_tables(c_views)) - return Table.from_unique_ptr( + + return data_from_unique_ptr( move(c_result), column_names=tables[0]._column_names, index_names=None if ignore_index else tables[0]._index_names diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index d114a04eec4..ed31574b4a5 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -11,6 +11,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector from rmm._lib.device_buffer cimport DeviceBuffer + from cudf.core.buffer import Buffer from cudf._lib.column cimport Column @@ -35,6 +36,7 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type +from cudf._lib.utils cimport data_from_table_view, data_from_unique_ptr # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar @@ -178,7 +180,7 @@ def gather( ) ) - 
return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=( @@ -210,19 +212,17 @@ def _scatter_table(Table source_table, Column scatter_map, ) ) - out_table = Table.from_unique_ptr( + data, _ = data_from_unique_ptr( move(c_result), column_names=target_table._column_names, index_names=None ) - out_table._index = ( + return data, ( None if target_table._index is None else target_table._index.copy( deep=False) ) - return out_table - def _scatter_scalar(scalars, Column scatter_map, Table target_table, bool bounds_check=True): @@ -250,19 +250,17 @@ def _scatter_scalar(scalars, Column scatter_map, ) ) - out_table = Table.from_unique_ptr( + data, _ = data_from_unique_ptr( move(c_result), column_names=target_table._column_names, index_names=None ) - out_table._index = ( + return data, ( None if target_table._index is None else target_table._index.copy( deep=False) ) - return out_table - def scatter(object input, object scatter_map, Table target, bool bounds_check=True): @@ -306,7 +304,7 @@ def _reverse_table(Table source_table): reverse_table_view )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=source_table._index_names @@ -371,7 +369,7 @@ def table_empty_like(Table input_table, bool keep_index=True): with nogil: c_result = move(cpp_copying.empty_like(input_table_view)) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=input_table._column_names, index_names=( @@ -434,8 +432,8 @@ def table_slice(Table input_table, object indices, bool keep_index=True): ) num_of_result_cols = c_result.size() - result =[ - Table.from_table_view( + return [ + data_from_table_view( c_result[i], input_table, column_names=input_table._column_names, @@ -446,8 +444,6 @@ def table_slice(Table input_table, object indices, bool keep_index=True): ) ) for i in range(num_of_result_cols)] - return result - def column_split(Column input_column, object splits): @@ -505,8 +501,8 @@ def table_split(Table input_table, object splits, bool keep_index=True): ) num_of_result_cols = c_result.size() - result = [ - Table.from_table_view( + return [ + data_from_table_view( c_result[i], input_table, column_names=input_table._column_names, @@ -515,8 +511,6 @@ def table_split(Table input_table, object splits, bool keep_index=True): else None ) for i in range(num_of_result_cols)] - return result - def _copy_if_else_column_column(Column lhs, Column rhs, Column boolean_mask): @@ -642,7 +636,7 @@ def _boolean_mask_scatter_table(Table input_table, Table target_table, ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=target_table._column_names, index_names=target_table._index._column_names @@ -672,13 +666,15 @@ def _boolean_mask_scatter_scalar(list input_scalars, Table target_table, ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=target_table._column_names, index_names=target_table._index._column_names ) +# TODO: This function is currently unused but should be used in +# ColumnBase.__setitem__, see https://github.com/rapidsai/cudf/issues/8667. 
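The TODO above points at ColumnBase.__setitem__; a minimal, hedged Python sketch of the user-level masked assignment that such a boolean-mask scatter would back (the series contents are invented for illustration and are not part of this diff):

    # Hedged illustration only: masked assignment on a cudf Series, the public
    # operation a boolean-mask scatter would ultimately implement.
    import cudf

    s = cudf.Series([1, 2, 3, 4])
    mask = [True, False, True, False]
    s[mask] = 0              # rows where the mask is True are overwritten
    # s now holds [0, 2, 0, 4]
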
def boolean_mask_scatter(object input, Table target_table, Column boolean_mask): @@ -755,7 +751,7 @@ def sample(Table input, size_type n, cpp_copying.sample(tbl_view, n, replacement, seed) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_output), column_names=input._column_names, index_names=( @@ -791,12 +787,12 @@ cdef class _CPackedColumns: """ Construct a ``PackedColumns`` object from a ``cudf.DataFrame``. """ - from cudf.core import RangeIndex, dtypes + import cudf.core.dtypes cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns) if keep_index and ( - not isinstance(input_table.index, RangeIndex) + not isinstance(input_table.index, cudf.RangeIndex) or input_table.index.start != 0 or input_table.index.stop != len(input_table) or input_table.index.step != 1 @@ -809,7 +805,7 @@ cdef class _CPackedColumns: p.column_names = input_table._column_names p.column_dtypes = {} for name, col in input_table._data.items(): - if isinstance(col.dtype, dtypes._BaseDtype): + if isinstance(col.dtype, cudf.core.dtypes._BaseDtype): p.column_dtypes[name] = col.dtype p.c_obj = move(cpp_copying.pack(input_table_view)) @@ -887,12 +883,12 @@ cdef class _CPackedColumns: return p def unpack(self): - output_table = Table.from_table_view( + output_table = Table(*data_from_table_view( cpp_copying.unpack(self.c_obj), self, self.column_names, self.index_names - ) + )) for name, dtype in self.column_dtypes.items(): output_table._data[name] = ( diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index b13815c925d..13bfa49057c 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -43,6 +43,12 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef cppclass rolling_aggregation: aggregation.Kind kind + cdef cppclass groupby_aggregation: + aggregation.Kind kind + + cdef cppclass groupby_scan_aggregation: + aggregation.Kind kind + ctypedef enum udf_type: CUDA 'cudf::udf_type::CUDA' PTX 'cudf::udf_type::PTX' @@ -87,6 +93,11 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: size_type n ) except + + cdef unique_ptr[T] make_nth_element_aggregation[T]( + size_type n, + null_policy null_handling + ) except + + cdef unique_ptr[T] make_collect_list_aggregation[T]() except + cdef unique_ptr[T] make_collect_set_aggregation[T]() except + diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index 29a6518fae8..a318dc68ac9 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -122,7 +122,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: vector[size_type] splits ) except + - cdef struct packed_columns: + cdef cppclass packed_columns: unique_ptr[metadata] metadata_ unique_ptr[device_buffer] gpu_data diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index 56ebc3a77fc..ef97be3cf9e 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -18,3 +18,8 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: ) except + cdef unique_ptr[column] day_of_year(const column_view& column) except + cdef unique_ptr[column] is_leap_year(const column_view& column) except + + cdef unique_ptr[column] last_day_of_month( + const column_view& column + ) except + + cdef unique_ptr[column] extract_quarter(const column_view& column) except + + cdef unique_ptr[column] days_in_month(const column_view& column) 
except + diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd index 2d8f251799d..2ecdf76842f 100644 --- a/python/cudf/cudf/_lib/cpp/groupby.pxd +++ b/python/cudf/cudf/_lib/cpp/groupby.pxd @@ -5,7 +5,10 @@ from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector -from cudf._lib.cpp.aggregation cimport aggregation +from cudf._lib.cpp.aggregation cimport ( + groupby_aggregation, + groupby_scan_aggregation, +) from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.libcpp.functional cimport reference_wrapper @@ -26,7 +29,12 @@ cdef extern from "cudf/groupby.hpp" \ cdef cppclass aggregation_request: aggregation_request() except + column_view values - vector[unique_ptr[aggregation]] aggregations + vector[unique_ptr[groupby_aggregation]] aggregations + + cdef cppclass scan_request: + scan_request() except + + column_view values + vector[unique_ptr[groupby_scan_aggregation]] aggregations cdef cppclass aggregation_result: vector[unique_ptr[column]] results @@ -76,7 +84,7 @@ cdef extern from "cudf/groupby.hpp" \ unique_ptr[table], vector[aggregation_result] ] scan( - const vector[aggregation_request]& requests, + const vector[scan_request]& requests, ) except + pair[ diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/cpp/io/csv.pxd index c5e235b5697..4afd8732320 100644 --- a/python/cudf/cudf/_lib/cpp/io/csv.pxd +++ b/python/cudf/cudf/_lib/cpp/io/csv.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -49,8 +50,10 @@ cdef extern from "cudf/io/csv.hpp" \ cudf_io_types.quote_style get_quoting() except+ char get_quotechar() except+ bool is_enabled_doublequote() except+ - vector[string] get_infer_date_names() except+ - vector[int] get_infer_date_indexes() except+ + vector[string] get_parse_dates_names() except+ + vector[int] get_parse_dates_indexes() except+ + vector[string] get_parse_hex_names() except+ + vector[int] get_parse_hex_indexes() except+ # Conversion settings vector[string] get_dtype() except+ @@ -92,11 +95,14 @@ cdef extern from "cudf/io/csv.hpp" \ void set_quoting(cudf_io_types.quote_style style) except+ void set_quotechar(char val) except+ void set_doublequote(bool val) except+ - void set_infer_date_names(vector[string]) except+ - void set_infer_date_indexes(vector[int]) except+ + void set_parse_dates(vector[string]) except+ + void set_parse_dates(vector[int]) except+ + void set_parse_hex(vector[string]) except+ + void set_parse_hex(vector[int]) except+ # Conversion settings - void set_dtypes(vector[string] types) except+ + void set_dtypes(vector[data_type] types) except+ + void set_dtypes(map[string, data_type] types) except+ void set_true_values(vector[string] vals) except+ void set_false_values(vector[string] vals) except+ void set_na_values(vector[string] vals) except+ @@ -157,11 +163,15 @@ cdef extern from "cudf/io/csv.hpp" \ ) except+ csv_reader_options_builder& quotechar(char val) except+ csv_reader_options_builder& doublequote(bool val) except+ - csv_reader_options_builder& infer_date_names(vector[string]) except+ - csv_reader_options_builder& infer_date_indexes(vector[int]) except+ + csv_reader_options_builder& parse_dates(vector[string]) except+ + csv_reader_options_builder& parse_dates(vector[int]) except+ # Conversion settings 
csv_reader_options_builder& dtypes(vector[string] types) except+ + csv_reader_options_builder& dtypes(vector[data_type] types) except+ + csv_reader_options_builder& dtypes( + map[string, data_type] types + ) except+ csv_reader_options_builder& true_values(vector[string] vals) except+ csv_reader_options_builder& false_values(vector[string] vals) except+ csv_reader_options_builder& na_values(vector[string] vals) except+ diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index 6f20195e87f..2c65e329bb0 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -25,7 +26,8 @@ cdef extern from "cudf/io/json.hpp" \ bool is_enabled_dayfirst() except+ # setter - void set_dtypes(vector[string] types) except+ + void set_dtypes(vector[data_type] types) except+ + void set_dtypes(map[string, data_type] types) except+ void set_compression( cudf_io_types.compression_type compression ) except+ @@ -47,6 +49,12 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& dtypes( vector[string] types ) except+ + json_reader_options_builder& dtypes( + vector[data_type] types + ) except+ + json_reader_options_builder& dtypes( + map[string, data_type] types + ) except+ json_reader_options_builder& compression( cudf_io_types.compression_type compression ) except+ diff --git a/python/cudf/cudf/_lib/cpp/strings/repeat.pxd b/python/cudf/cudf/_lib/cpp/strings/repeat.pxd new file mode 100644 index 00000000000..2a6754b9a11 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/strings/repeat.pxd @@ -0,0 +1,19 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.types cimport size_type + + +cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \ + nogil: + + cdef unique_ptr[column] repeat_strings( + column_view strings, + size_type repeat) except + + + cdef unique_ptr[column] repeat_strings( + column_view strings, + column_view repeats) except + diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 773e81a0a7b..812d614e6d3 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -1,11 +1,16 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
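The csv.pyx changes that follow route dtype entries through libcudf data_types and the new parse_hex/parse_dates options; a minimal, hedged sketch of the reader behavior they expose (the column names and CSV payload are invented for illustration):

    # Hedged usage sketch, assuming a small in-memory CSV with a hex-encoded
    # integer column and a date column.
    import cudf
    from io import StringIO

    buf = StringIO("id,ts\n1a,2021-08-04\nff,2021-08-05\n")
    df = cudf.read_csv(
        buf,
        dtype={"id": "hex"},   # parsed as hexadecimal into an int64 column
        parse_dates=["ts"],    # parsed into a datetime column
    )
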
from libcpp cimport bool +from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector +cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib.cpp.types cimport data_type, type_id +from cudf._lib.types cimport dtype_to_data_type + import numpy as np import pandas as pd @@ -40,6 +45,7 @@ from cudf._lib.cpp.io.types cimport ( from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.io.utils cimport make_sink_info, make_source_info from cudf._lib.table cimport Table, make_table_view +from cudf._lib.utils cimport data_from_unique_ptr ctypedef int32_t underlying_type_t_compression @@ -68,6 +74,12 @@ class Compression(IntEnum): ) +CSV_HEX_TYPE_MAP = { + "hex": np.dtype("int64"), + "hex64": np.dtype("int64"), + "hex32": np.dtype("int32") +} + cdef csv_reader_options make_csv_reader_options( object datasource, object lineterminator, @@ -116,9 +128,12 @@ cdef csv_reader_options make_csv_reader_options( cdef vector[string] c_use_cols_names cdef size_type c_nrows = nrows if nrows is not None else -1 cdef quote_style c_quoting - cdef vector[string] c_infer_date_names - cdef vector[int] c_infer_date_indexes - cdef vector[string] c_dtypes + cdef vector[string] c_parse_dates_names + cdef vector[int] c_parse_dates_indexes + cdef vector[string] c_hex_col_names + cdef vector[data_type] c_dtypes_list + cdef map[string, data_type] c_dtypes_map + cdef vector[int] c_hex_col_indexes cdef vector[string] c_true_values cdef vector[string] c_false_values cdef vector[string] c_na_values @@ -220,48 +235,61 @@ cdef csv_reader_options make_csv_reader_options( "`parse_dates`: non-lists are unsupported") for col in parse_dates: if isinstance(col, str): - c_infer_date_names.push_back(str(col).encode()) + c_parse_dates_names.push_back(str(col).encode()) elif isinstance(col, int): - c_infer_date_indexes.push_back(col) + c_parse_dates_indexes.push_back(col) else: raise NotImplementedError( "`parse_dates`: Nesting is unsupported") - csv_reader_options_c.set_infer_date_names(c_infer_date_names) - csv_reader_options_c.set_infer_date_indexes(c_infer_date_indexes) + csv_reader_options_c.set_parse_dates(c_parse_dates_names) + csv_reader_options_c.set_parse_dates(c_parse_dates_indexes) if dtype is not None: if isinstance(dtype, abc.Mapping): - c_dtypes.reserve(len(dtype)) for k, v in dtype.items(): - c_dtypes.push_back( - str( - str(k)+":"+ - _get_cudf_compatible_str_from_dtype(v) - ).encode() - ) + col_type = v + if v in CSV_HEX_TYPE_MAP: + col_type = CSV_HEX_TYPE_MAP[v] + c_hex_col_names.push_back(str(k).encode()) + + c_dtypes_map[str(k).encode()] = \ + _get_cudf_data_type_from_dtype( + cudf.dtype(col_type)) + csv_reader_options_c.set_dtypes(c_dtypes_map) + csv_reader_options_c.set_parse_hex(c_hex_col_names) elif ( cudf.utils.dtypes.is_scalar(dtype) or isinstance(dtype, ( np.dtype, pd.core.dtypes.dtypes.ExtensionDtype, type )) ): - c_dtypes.reserve(1) - c_dtypes.push_back( - _get_cudf_compatible_str_from_dtype(dtype).encode() + c_dtypes_list.reserve(1) + if dtype in CSV_HEX_TYPE_MAP: + dtype = CSV_HEX_TYPE_MAP[dtype] + c_hex_col_indexes.push_back(0) + + c_dtypes_list.push_back( + _get_cudf_data_type_from_dtype(dtype) ) + csv_reader_options_c.set_dtypes(c_dtypes_list) + csv_reader_options_c.set_parse_hex(c_hex_col_indexes) elif isinstance(dtype, abc.Iterable): - c_dtypes.reserve(len(dtype)) - for col_dtype in dtype: - c_dtypes.push_back( - _get_cudf_compatible_str_from_dtype(col_dtype).encode() + 
c_dtypes_list.reserve(len(dtype)) + for index, col_dtype in enumerate(dtype): + if col_dtype in CSV_HEX_TYPE_MAP: + col_dtype = CSV_HEX_TYPE_MAP[col_dtype] + c_hex_col_indexes.push_back(index) + + c_dtypes_list.push_back( + _get_cudf_data_type_from_dtype(col_dtype) ) + csv_reader_options_c.set_dtypes(c_dtypes_list) + csv_reader_options_c.set_parse_hex(c_hex_col_indexes) else: raise ValueError( "dtype should be a scalar/str/list-like/dict-like" ) - csv_reader_options_c.set_dtypes(c_dtypes) - if true_values is not None: c_true_values.reserve(len(true_values)) for tv in true_values: @@ -358,7 +386,7 @@ def read_csv( See Also -------- - cudf.io.csv.read_csv + cudf.read_csv """ if not isinstance(datasource, (BytesIO, StringIO, bytes, @@ -393,7 +421,7 @@ def read_csv( c_result = move(cpp_read_csv(read_csv_options_c)) meta_names = [name.decode() for name in c_result.metadata.column_names] - df = cudf.DataFrame._from_table(Table.from_unique_ptr( + df = cudf.DataFrame._from_data(*data_from_unique_ptr( move(c_result.tbl), column_names=meta_names )) @@ -428,7 +456,7 @@ cpdef write_csv( See Also -------- - cudf.io.csv.to_csv + cudf.to_csv """ cdef table_view input_table_view = \ table.view() if index is True else table.data_view() @@ -483,7 +511,7 @@ cpdef write_csv( cpp_write_csv(options) -def _get_cudf_compatible_str_from_dtype(dtype): +cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +: # TODO: Remove this Error message once the # following issue is fixed: # https://github.com/rapidsai/cudf/issues/3960 @@ -493,29 +521,38 @@ def _get_cudf_compatible_str_from_dtype(dtype): "supported in CSV reader" ) - if ( - str(dtype) in cudf.utils.dtypes.ALL_TYPES or - str(dtype) in { - "hex", "hex32", "hex64", "date", "date32", "timestamp", - "timestamp[us]", "timestamp[s]", "timestamp[ms]", "timestamp[ns]", - "date64" - } - ): - return str(dtype) - pd_dtype = pd.core.dtypes.common.pandas_dtype(dtype) - - if pd_dtype in cudf.utils.dtypes.pandas_dtypes_to_cudf_dtypes: - return str(cudf.utils.dtypes.pandas_dtypes_to_cudf_dtypes[pd_dtype]) - elif isinstance(pd_dtype, np.dtype) and pd_dtype.kind in ("O", "U"): - return "str" - elif ( - pd_dtype in cudf.utils.dtypes.cudf_dtypes_to_pandas_dtypes or - str(pd_dtype) in cudf.utils.dtypes.ALL_TYPES or - cudf.utils.dtypes.is_categorical_dtype(pd_dtype) - ): - return str(pd_dtype) - else: - raise ValueError(f"dtype not understood: {dtype}") + if isinstance(dtype, str): + if str(dtype) == "date32": + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_DAYS + ) + elif str(dtype) in ("date", "date64"): + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_MILLISECONDS + ) + elif str(dtype) == "timestamp": + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_MILLISECONDS + ) + elif str(dtype) == "timestamp[us]": + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_MICROSECONDS + ) + elif str(dtype) == "timestamp[s]": + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_SECONDS + ) + elif str(dtype) == "timestamp[ms]": + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_MILLISECONDS + ) + elif str(dtype) == "timestamp[ns]": + return libcudf_types.data_type( + libcudf_types.type_id.TIMESTAMP_NANOSECONDS + ) + + dtype = cudf.dtype(dtype) + return dtype_to_data_type(dtype) def columns_apply_na_rep(column_names, na_rep): diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 3b13cedcfd7..1b152f1a3b7 100644 --- 
a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -60,6 +60,8 @@ def extract_datetime_component(Column col, object field): def is_leap_year(Column col): + """Returns a boolean indicator whether the year of the date is a leap year + """ cdef unique_ptr[column] c_result cdef column_view col_view = col.view() @@ -67,3 +69,39 @@ def is_leap_year(Column col): c_result = move(libcudf_datetime.is_leap_year(col_view)) return Column.from_unique_ptr(move(c_result)) + + +def extract_quarter(Column col): + """ + Returns a column which contains the corresponding quarter of the year + for every timestamp inside the input column. + """ + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + + with nogil: + c_result = move(libcudf_datetime.extract_quarter(col_view)) + + return Column.from_unique_ptr(move(c_result)) + + +def days_in_month(Column col): + """Extracts the number of days in the month of the date + """ + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + + with nogil: + c_result = move(libcudf_datetime.days_in_month(col_view)) + + return Column.from_unique_ptr(move(c_result)) + + +def last_day_of_month(Column col): + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + + with nogil: + c_result = move(libcudf_datetime.last_day_of_month(col_view)) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/filling.pyx b/python/cudf/cudf/_lib/filling.pyx index d9fdf72415c..99a3957006b 100644 --- a/python/cudf/cudf/_lib/filling.pyx +++ b/python/cudf/cudf/_lib/filling.pyx @@ -16,6 +16,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def fill_in_place(Column destination, int begin, int end, DeviceScalar value): @@ -70,7 +71,7 @@ def _repeat_via_column(Table inp, Column count, bool check_count): c_check_count )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=inp._column_names, index_names=inp._index_names @@ -87,7 +88,7 @@ def _repeat_via_size_type(Table inp, size_type count): count )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=inp._column_names, index_names=inp._index_names diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 12e3f65a8a2..d7416625248 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -22,6 +22,8 @@ from libcpp.pair cimport pair from libcpp.utility cimport move from libcpp.vector cimport vector +import cudf + from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table @@ -30,7 +32,12 @@ from cudf._lib.scalar import as_device_scalar cimport cudf._lib.cpp.groupby as libcudf_groupby cimport cudf._lib.cpp.types as libcudf_types -from cudf._lib.aggregation cimport Aggregation, make_aggregation +from cudf._lib.aggregation cimport ( + GroupbyAggregation, + GroupbyScanAggregation, + make_groupby_aggregation, + make_groupby_scan_aggregation, +) from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.libcpp.functional cimport reference_wrapper @@ -39,6 +46,7 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.cpp.types cimport 
size_type from cudf._lib.cpp.utilities.host_span cimport host_span +from cudf._lib.utils cimport data_from_unique_ptr # The sets below define the possible aggregations that can be performed on # different dtypes. These strings must be elements of the AggregationKind enum. @@ -91,41 +99,24 @@ cdef class GroupBy: c_grouped_values = move(c_groups.values) c_group_offsets = c_groups.offsets - grouped_keys = Table.from_unique_ptr( + grouped_keys = cudf.Index._from_data(*data_from_unique_ptr( move(c_grouped_keys), column_names=range(c_grouped_keys.get()[0].num_columns()) - ) - grouped_values = Table.from_unique_ptr( + )) + grouped_values = data_from_unique_ptr( move(c_grouped_values), index_names=values._index_names, column_names=values._column_names ) return grouped_keys, grouped_values, c_group_offsets - def aggregate(self, Table values, aggregations): - """ - Parameters - ---------- - values : Table - aggregations - A dict mapping column names in `Table` to a list of aggregations - to perform on that column - - Each aggregation may be specified as: - - a string (e.g., "max") - - a lambda/function - - Returns - ------- - Table of aggregated values - """ + def aggregate_internal(self, Table values, aggregations): from cudf.core.column_accessor import ColumnAccessor cdef vector[libcudf_groupby.aggregation_request] c_agg_requests cdef libcudf_groupby.aggregation_request c_agg_request cdef Column col - cdef Aggregation agg_obj + cdef GroupbyAggregation agg_obj - cdef bool scan = _is_all_scan_aggregate(aggregations) allow_empty = all(len(v) == 0 for v in aggregations.values()) included_aggregations = defaultdict(list) @@ -151,7 +142,7 @@ cdef class GroupBy: c_agg_request = move(libcudf_groupby.aggregation_request()) for agg in aggs: - agg_obj = make_aggregation(agg) + agg_obj = make_groupby_aggregation(agg) if (valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations): included_aggregations[col_name].append(agg) @@ -172,32 +163,92 @@ cdef class GroupBy: vector[libcudf_groupby.aggregation_result] ] c_result - try: - with nogil: - if scan: - c_result = move( - self.c_obj.get()[0].scan( - c_agg_requests - ) - ) - else: - c_result = move( - self.c_obj.get()[0].aggregate( - c_agg_requests - ) + with nogil: + c_result = move( + self.c_obj.get()[0].aggregate( + c_agg_requests + ) + ) + + grouped_keys, _ = data_from_unique_ptr( + move(c_result.first), + column_names=self.keys._column_names + ) + + result_data = ColumnAccessor(multiindex=True) + # Note: This loop relies on the included_aggregations dict being + # insertion ordered to map results to requested aggregations by index. 
+ for i, col_name in enumerate(included_aggregations): + for j, agg_name in enumerate(included_aggregations[col_name]): + if callable(agg_name): + agg_name = agg_name.__name__ + result_data[(col_name, agg_name)] = ( + Column.from_unique_ptr(move(c_result.second[i].results[j])) + ) + + return result_data, cudf.Index._from_data(grouped_keys) + + def scan_internal(self, Table values, aggregations): + from cudf.core.column_accessor import ColumnAccessor + cdef vector[libcudf_groupby.scan_request] c_agg_requests + cdef libcudf_groupby.scan_request c_agg_request + cdef Column col + cdef GroupbyScanAggregation agg_obj + + allow_empty = all(len(v) == 0 for v in aggregations.values()) + + included_aggregations = defaultdict(list) + for i, (col_name, aggs) in enumerate(aggregations.items()): + col = values._data[col_name] + dtype = col.dtype + + valid_aggregations = ( + _LIST_AGGS if is_list_dtype(dtype) + else _STRING_AGGS if is_string_dtype(dtype) + else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) + else _STRUCT_AGGS if is_struct_dtype(dtype) + else _INTERVAL_AGGS if is_interval_dtype(dtype) + else _DECIMAL_AGGS if is_decimal_dtype(dtype) + else "ALL" + ) + if (valid_aggregations is _DECIMAL_AGGS + and rmm._cuda.gpu.runtimeGetVersion() < 11000): + raise RuntimeError( + "Decimal aggregations are only supported on CUDA >= 11 " + "due to an nvcc compiler bug." + ) + + c_agg_request = move(libcudf_groupby.scan_request()) + for agg in aggs: + agg_obj = make_groupby_scan_aggregation(agg) + if (valid_aggregations == "ALL" + or agg_obj.kind in valid_aggregations): + included_aggregations[col_name].append(agg) + c_agg_request.aggregations.push_back( + move(agg_obj.c_obj) ) - except RuntimeError as e: - # TODO: remove this try..except after - # https://github.com/rapidsai/cudf/issues/7611 - # is resolved - if ("make_empty_column") in str(e): - raise NotImplementedError( - "Aggregation not supported for empty columns" - ) from e - else: - raise - - grouped_keys = Table.from_unique_ptr( + if not c_agg_request.aggregations.empty(): + c_agg_request.values = col.view() + c_agg_requests.push_back( + move(c_agg_request) + ) + + if c_agg_requests.empty() and not allow_empty: + raise DataError("All requested aggregations are unsupported.") + + cdef pair[ + unique_ptr[table], + vector[libcudf_groupby.aggregation_result] + ] c_result + + with nogil: + c_result = move( + self.c_obj.get()[0].scan( + c_agg_requests + ) + ) + + grouped_keys, _ = data_from_unique_ptr( move(c_result.first), column_names=self.keys._column_names ) @@ -213,7 +264,29 @@ cdef class GroupBy: Column.from_unique_ptr(move(c_result.second[i].results[j])) ) - return Table(data=result_data, index=grouped_keys) + return result_data, cudf.Index._from_data(grouped_keys) + + def aggregate(self, Table values, aggregations): + """ + Parameters + ---------- + values : Table + aggregations + A dict mapping column names in `Table` to a list of aggregations + to perform on that column + + Each aggregation may be specified as: + - a string (e.g., "max") + - a lambda/function + + Returns + ------- + Table of aggregated values + """ + if _is_all_scan_aggregate(aggregations): + return self.scan_internal(values, aggregations) + + return self.aggregate_internal(values, aggregations) def shift(self, Table values, int periods, list fill_values): cdef table_view view = values.view() @@ -238,16 +311,16 @@ cdef class GroupBy: self.c_obj.get()[0].shift(view, offsets, c_fill_values) ) - grouped_keys = Table.from_unique_ptr( + grouped_keys = 
cudf.Index._from_data(*data_from_unique_ptr( move(c_result.first), column_names=self.keys._column_names - ) + )) - shifted = Table.from_unique_ptr( + shifted, _ = data_from_unique_ptr( move(c_result.second), column_names=values._column_names ) - return Table(data=shifted._data, index=grouped_keys) + return shifted, grouped_keys def replace_nulls(self, Table values, object method): cdef table_view val_view = values.view() @@ -265,12 +338,10 @@ cdef class GroupBy: self.c_obj.get()[0].replace_nulls(val_view, policies) ) - grouped_result = Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result.second), column_names=values._column_names - ) + )[0] - result = Table(data=grouped_result._data) - return result _GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax"} diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 198e7a748c9..137b19ef69c 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -15,6 +15,7 @@ from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def hash_partition(Table source_table, object columns_to_hash, @@ -41,12 +42,14 @@ def hash_partition(Table source_table, object columns_to_hash, # the original table (`source_table`) is empty. We need to # return a list of zeros in this case. return ( - Table.from_unique_ptr( + *data_from_unique_ptr( move(c_result.first), column_names=source_table._column_names, - index_names=source_table._index_names if( - keep_index is True) - else None + index_names=( + source_table._index_names + if keep_index is True + else None + ) ), list(c_result.second) if c_result.second.size() diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 08ea58e4587..234513733d1 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -21,6 +21,7 @@ from cudf._lib.cpp.interop cimport ( from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def from_dlpack(dlpack_capsule): @@ -40,7 +41,7 @@ def from_dlpack(dlpack_capsule): cpp_from_dlpack(dlpack_tensor) ) - res = Table.from_unique_ptr( + res = data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) @@ -164,10 +165,8 @@ def from_arrow( with nogil: c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) - out_table = Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=column_names, index_names=index_names ) - - return out_table diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 82ad9d67f78..66d93ffc531 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -3,6 +3,7 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector +from cudf._lib.column cimport Column from cudf._lib.cpp.io.types cimport ( column_name_info, data_sink, @@ -17,3 +18,7 @@ cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* cdef update_struct_field_names( Table table, vector[column_name_info]& schema_info) +cdef Column update_column_struct_field_names( + Column col, + column_name_info& info +) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 
72ab64f6249..d26cf19deaf 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -127,12 +127,12 @@ cdef update_struct_field_names( vector[column_name_info]& schema_info ): for i, (name, col) in enumerate(table._data.items()): - table._data[name] = _update_column_struct_field_names( + table._data[name] = update_column_struct_field_names( col, schema_info[i] ) -cdef Column _update_column_struct_field_names( +cdef Column update_column_struct_field_names( Column col, column_name_info& info ): @@ -149,7 +149,7 @@ cdef Column _update_column_struct_field_names( if col.children: children = list(col.children) for i, child in enumerate(children): - children[i] = _update_column_struct_field_names( + children[i] = update_column_struct_field_names( child, info.children[i] ) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 4a15edf8a19..68d9da57e83 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -10,18 +10,22 @@ import os import cudf from libcpp cimport bool +from libcpp.map cimport map from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types +cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.io.json cimport ( json_reader_options, read_json as libcudf_read_json, ) -from cudf._lib.cpp.types cimport size_type +from cudf._lib.cpp.types cimport data_type, size_type, type_id from cudf._lib.io.utils cimport make_source_info from cudf._lib.table cimport Table +from cudf._lib.types cimport dtype_to_data_type +from cudf._lib.utils cimport data_from_unique_ptr cpdef read_json(object filepaths_or_buffers, @@ -50,7 +54,8 @@ cpdef read_json(object filepaths_or_buffers, filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode() # Setup arguments - cdef vector[string] c_dtypes + cdef vector[data_type] c_dtypes_list + cdef map[string, data_type] c_dtypes_map cdef cudf_io_types.compression_type c_compression # Determine byte read offsets if applicable cdef size_type c_range_offset = ( @@ -70,40 +75,36 @@ cpdef read_json(object filepaths_or_buffers, c_compression = cudf_io_types.compression_type.AUTO else: c_compression = cudf_io_types.compression_type.NONE - + is_list_like_dtypes = False if dtype is False: raise ValueError("False value is unsupported for `dtype`") elif dtype is not True: if isinstance(dtype, abc.Mapping): - c_dtypes.reserve(len(dtype)) for k, v in dtype.items(): - if cudf.utils.dtypes.is_categorical_dtype(v): - raise NotImplementedError( - "CategoricalDtype as dtype is not yet " - "supported in JSON reader" - ) - c_dtypes.push_back(str(str(k) + ":" + str(v)).encode()) + c_dtypes_map[str(k).encode()] = \ + _get_cudf_data_type_from_dtype(v) elif not isinstance(dtype, abc.Iterable): raise TypeError("`dtype` must be 'list like' or 'dict'") else: - c_dtypes.reserve(len(dtype)) + is_list_like_dtypes = True + c_dtypes_list.reserve(len(dtype)) for col_dtype in dtype: - if cudf.utils.dtypes.is_categorical_dtype(col_dtype): - raise NotImplementedError( - "CategoricalDtype as dtype is not yet " - "supported in JSON reader" - ) - c_dtypes.push_back(str(col_dtype).encode()) + c_dtypes_list.push_back( + _get_cudf_data_type_from_dtype( + col_dtype)) cdef json_reader_options opts = move( json_reader_options.builder(make_source_info(filepaths_or_buffers)) - .dtypes(c_dtypes) .compression(c_compression) .lines(c_lines) .byte_range_offset(c_range_offset) .byte_range_size(c_range_size) .build() ) + if 
is_list_like_dtypes: + opts.set_dtypes(c_dtypes_list) + else: + opts.set_dtypes(c_dtypes_map) # Read JSON cdef cudf_io_types.table_with_metadata c_out_table @@ -112,5 +113,15 @@ cpdef read_json(object filepaths_or_buffers, c_out_table = move(libcudf_read_json(opts)) column_names = [x.decode() for x in c_out_table.metadata.column_names] - return Table.from_unique_ptr(move(c_out_table.tbl), - column_names=column_names) + return data_from_unique_ptr(move(c_out_table.tbl), + column_names=column_names) + +cdef data_type _get_cudf_data_type_from_dtype(object dtype) except +: + if cudf.utils.dtypes.is_categorical_dtype(dtype): + raise NotImplementedError( + "CategoricalDtype as dtype is not yet " + "supported in JSON reader" + ) + + dtype = cudf.dtype(dtype) + return dtype_to_data_type(dtype) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 8ada3376fdb..59c3a4b89dc 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -43,6 +43,7 @@ from cudf.core.dtypes import ListDtype from cudf._lib.cpp.lists.contains cimport contains from cudf._lib.cpp.lists.extract cimport extract_list_element +from cudf._lib.utils cimport data_from_unique_ptr def count_elements(Column col): @@ -72,7 +73,7 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): with nogil: c_result = move(cpp_explode_outer(c_table_view, c_explode_column_idx)) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=tbl._column_names, index_names=None if ignore_index else tbl._index_names diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx index cc2d405c207..83f088f4419 100644 --- a/python/cudf/cudf/_lib/merge.pyx +++ b/python/cudf/cudf/_lib/merge.pyx @@ -11,6 +11,7 @@ from cudf._lib.cpp.merge cimport merge as cpp_merge from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def merge_sorted( @@ -102,7 +103,7 @@ def merge_sorted( ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=index_names, diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 2470c15f541..995243c7ea7 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -40,6 +40,7 @@ from cudf._lib.cpp.types cimport data_type, size_type, type_id from cudf._lib.io.utils cimport ( make_sink_info, make_source_info, + update_column_struct_field_names, update_struct_field_names, ) from cudf._lib.table cimport Table @@ -50,7 +51,7 @@ from cudf._lib.types cimport underlying_type_t_type_id import numpy as np -from cudf._lib.utils cimport get_column_names +from cudf._lib.utils cimport data_from_unique_ptr, get_column_names from cudf._lib.utils import _index_level_name, generate_pandas_metadata @@ -83,7 +84,7 @@ cpdef read_orc(object filepaths_or_buffers, See Also -------- - cudf.io.orc.read_orc + cudf.read_orc """ cdef orc_reader_options c_orc_reader_options = make_orc_reader_options( filepaths_or_buffers, @@ -96,7 +97,7 @@ cpdef read_orc(object filepaths_or_buffers, if timestamp_type is None else ( ( - np_to_cudf_types[np.dtype(timestamp_type)] + np_to_cudf_types[cudf.dtype(timestamp_type)] ) ) ), @@ -111,11 +112,16 @@ cpdef read_orc(object filepaths_or_buffers, names = [name.decode() for name in c_result.metadata.column_names] - tbl = Table.from_unique_ptr(move(c_result.tbl), 
names) + data, index = data_from_unique_ptr(move(c_result.tbl), names) - update_struct_field_names(tbl, c_result.metadata.schema_info) + data = { + name: update_column_struct_field_names( + col, c_result.metadata.schema_info[i] + ) + for i, (name, col) in enumerate(data.items()) + } - return tbl + return data, index cdef compression_type _get_comp_type(object compression): @@ -136,7 +142,7 @@ cpdef write_orc(Table table, See Also -------- - cudf.io.orc.read_orc + cudf.read_orc """ cdef compression_type compression_ = _get_comp_type(compression) cdef table_metadata metadata_ = table_metadata() diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 52f3aada00b..95ae2202f68 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -26,7 +26,7 @@ from cudf.utils.dtypes import ( np_to_pa_dtype, ) -from cudf._lib.utils cimport get_column_names +from cudf._lib.utils cimport data_from_unique_ptr, get_column_names from cudf._lib.utils import _index_level_name, generate_pandas_metadata @@ -178,12 +178,10 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, for c in meta['columns']: if c['field_name'] == idx_col: index_col_names[idx_col] = c['name'] - df = cudf.DataFrame._from_table( - Table.from_unique_ptr( - move(c_out_table.tbl), - column_names=column_names - ) - ) + df = cudf.DataFrame._from_data(*data_from_unique_ptr( + move(c_out_table.tbl), + column_names=column_names + )) update_struct_field_names(df, c_out_table.metadata.schema_info) @@ -201,7 +199,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, meta_dtype = cols_dtype_map.get(col, None) df._data[col] = cudf.core.column.column_empty( row_count=0, - dtype=np.dtype(meta_dtype) + dtype=cudf.dtype(meta_dtype) ) # Set the index column diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index 865138bec84..90aa6bb0344 100644 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -16,6 +16,7 @@ from cudf._lib.table cimport Table from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib.utils cimport data_from_unique_ptr def partition(Table source_table, Column partition_map, @@ -44,7 +45,7 @@ def partition(Table source_table, Column partition_map, ) return ( - Table.from_unique_ptr( + *data_from_unique_ptr( move(c_result.first), column_names=source_table._column_names, index_names=source_table._index_names if( diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index 45a4ff7c92c..76bf587237c 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -32,6 +32,7 @@ from cudf._lib.cpp.types cimport ( order_info, sorted, ) +from cudf._lib.utils cimport data_from_unique_ptr def quantile( @@ -118,7 +119,7 @@ def quantiles(Table source_table, ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names ) diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx index fbed410de86..acca2694d10 100644 --- a/python/cudf/cudf/_lib/reshape.pyx +++ b/python/cudf/cudf/_lib/reshape.pyx @@ -13,6 +13,7 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def 
interleave_columns(Table source_table): @@ -35,7 +36,7 @@ def tile(Table source_table, size_type count): with nogil: c_result = move(cpp_tile(c_view, c_count)) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=source_table._index_names diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 8b0a34b134e..fe11d5e2627 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -30,11 +30,12 @@ from cudf.core.dtypes import ListDtype, StructDtype from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.table cimport Table +from cudf._lib.table cimport Table, make_table_view from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf._lib.interop import from_arrow, to_arrow +cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, fixed_point_scalar, @@ -58,10 +59,9 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, timestamp_us, ) +from cudf._lib.utils cimport data_from_table_view -from cudf.utils.dtypes import _decimal_to_int64, is_list_dtype, is_struct_dtype - -cimport cudf._lib.cpp.types as libcudf_types +import cudf cdef class DeviceScalar: @@ -80,7 +80,7 @@ cdef class DeviceScalar: dtype : dtype A NumPy dtype. """ - self._dtype = dtype if dtype.kind != 'U' else np.dtype('object') + self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') self._set_value(value, self._dtype) def _set_value(self, value, dtype): @@ -119,9 +119,9 @@ cdef class DeviceScalar: def _to_host_scalar(self): if isinstance(self.dtype, cudf.Decimal64Dtype): result = _get_py_decimal_from_fixed_point(self.c_value) - elif is_struct_dtype(self.dtype): + elif cudf.api.types.is_struct_dtype(self.dtype): result = _get_py_dict_from_struct(self.c_value) - elif is_list_dtype(self.dtype): + elif cudf.api.types.is_list_dtype(self.dtype): result = _get_py_list_from_list(self.c_value) elif pd.api.types.is_string_dtype(self.dtype): result = _get_py_string_from_string(self.c_value) @@ -308,7 +308,7 @@ cdef _set_decimal64_from_scalar(unique_ptr[scalar]& s, object value, object dtype, bool valid=True): - value = _decimal_to_int64(value) if valid else 0 + value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0 s.reset( new fixed_point_scalar[decimal64]( np.int64(value), scale_type(-dtype.scale), valid @@ -338,8 +338,8 @@ cdef _set_struct_from_pydict(unique_ptr[scalar]& s, names=columns ) - cdef Table table = from_arrow(pyarrow_table, column_names=columns) - cdef table_view struct_view = table.view() + data, _ = from_arrow(pyarrow_table, column_names=columns) + cdef table_view struct_view = make_table_view(data.values()) s.reset( new struct_scalar(struct_view, valid) @@ -352,11 +352,14 @@ cdef _get_py_dict_from_struct(unique_ptr[scalar]& s): cdef table_view struct_table_view = (s.get()).view() columns = [str(i) for i in range(struct_table_view.num_columns())] - cdef Table to_arrow_table = Table.from_table_view( + data, _ = data_from_table_view( struct_table_view, None, column_names=columns ) + cdef Table to_arrow_table = Table( + cudf.core.column_accessor.ColumnAccessor(data) + ) python_dict = to_arrow(to_arrow_table, columns).to_pydict() @@ -556,7 +559,7 @@ def _is_null_host_scalar(slr): def _create_proxy_nat_scalar(dtype): cdef DeviceScalar result = DeviceScalar.__new__(DeviceScalar) 
- dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.char in 'mM': nat = dtype.type('NaT').astype(dtype) if dtype.type == np.datetime64: diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 1d15052e41a..a07017ef796 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -24,6 +24,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport null_order, null_policy, order from cudf._lib.sort cimport underlying_type_t_rank_method from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def is_sorted( @@ -276,9 +277,9 @@ def rank_columns(Table source_table, object method, str na_option, cdef unique_ptr[table] c_result c_result.reset(new table(move(c_results))) - out_table = Table.from_unique_ptr( + data, _ = data_from_unique_ptr( move(c_result), - column_names=source_table._column_names + column_names=source_table._column_names, + index_names=None ) - out_table._index = source_table._index - return out_table + return data, source_table._index diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index a7326efcc03..f1eca64bb87 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -25,6 +25,7 @@ from cudf._lib.cpp.types cimport ( size_type, ) from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def drop_nulls(Table source_table, how="any", keys=None, thresh=None): @@ -78,7 +79,7 @@ def drop_nulls(Table source_table, how="any", keys=None, thresh=None): ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=( @@ -115,7 +116,7 @@ def apply_boolean_mask(Table source_table, Column boolean_mask): ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=( @@ -192,7 +193,7 @@ def drop_duplicates(Table source_table, ) ) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=source_table._column_names, index_names=( diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 8f65cc9fee5..25e4149183e 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -10,10 +10,6 @@ from cudf._lib.scalar cimport DeviceScalar from cudf._lib.types import np_to_cudf_types -from cudf._lib.types cimport underlying_type_t_type_id - -from cudf.core.column.column import as_column - from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -55,6 +51,9 @@ from cudf._lib.cpp.strings.convert.convert_urls cimport ( url_encode as cpp_url_encode, ) from cudf._lib.cpp.types cimport data_type, type_id +from cudf._lib.types cimport underlying_type_t_type_id + +import cudf def floating_to_string(Column input_col): @@ -115,7 +114,7 @@ def stod(Column input_col, **kwargs): A Column with strings cast to double """ - return string_to_floating(input_col, np.dtype("float64")) + return string_to_floating(input_col, cudf.dtype("float64")) def ftos(Column input_col): @@ -147,7 +146,7 @@ def stof(Column input_col, **kwargs): A Column with strings cast to float """ - return string_to_floating(input_col, np.dtype("float32")) + return string_to_floating(input_col, cudf.dtype("float32")) def integer_to_string(Column input_col): @@ -208,7 +207,7 @@ def 
stoi8(Column input_col, **kwargs): A Column with strings cast to int8 """ - return string_to_integer(input_col, np.dtype("int8")) + return string_to_integer(input_col, cudf.dtype("int8")) def i16tos(Column input_col): @@ -240,7 +239,7 @@ def stoi16(Column input_col): A Column with strings cast to int16 """ - return string_to_integer(input_col, np.dtype("int16")) + return string_to_integer(input_col, cudf.dtype("int16")) def itos(Column input_col): @@ -272,7 +271,7 @@ def stoi(Column input_col): A Column with strings cast to int32 """ - return string_to_integer(input_col, np.dtype("int32")) + return string_to_integer(input_col, cudf.dtype("int32")) def ltos(Column input_col): @@ -304,7 +303,7 @@ def stol(Column input_col, **kwargs): A Column with strings cast to int64 """ - return string_to_integer(input_col, np.dtype("int64")) + return string_to_integer(input_col, cudf.dtype("int64")) def ui8tos(Column input_col): @@ -336,7 +335,7 @@ def stoui8(Column input_col, **kwargs): A Column with strings cast to uint8 """ - return string_to_integer(input_col, np.dtype("uint8")) + return string_to_integer(input_col, cudf.dtype("uint8")) def ui16tos(Column input_col): @@ -368,7 +367,7 @@ def stoui16(Column input_col, **kwargs): A Column with strings cast to uint16 """ - return string_to_integer(input_col, np.dtype("uint16")) + return string_to_integer(input_col, cudf.dtype("uint16")) def uitos(Column input_col): @@ -400,7 +399,7 @@ def stoui(Column input_col, **kwargs): A Column with strings cast to uint32 """ - return string_to_integer(input_col, np.dtype("uint32")) + return string_to_integer(input_col, cudf.dtype("uint32")) def ultos(Column input_col): @@ -432,7 +431,7 @@ def stoul(Column input_col, **kwargs): A Column with strings cast to uint64 """ - return string_to_integer(input_col, np.dtype("uint64")) + return string_to_integer(input_col, cudf.dtype("uint64")) def _to_booleans(Column input_col, object string_true="True"): @@ -588,7 +587,7 @@ def istimestamp( """ if input_col.size == 0: - return as_column([], dtype=kwargs.get('dtype')) + return cudf.core.column.as_column([], dtype=kwargs.get('dtype')) cdef column_view input_column_view = input_col.view() cdef string c_timestamp_format = str(format).encode('UTF-8') cdef unique_ptr[column] c_result @@ -745,7 +744,7 @@ def htoi(Column input_col, **kwargs): cdef column_view input_column_view = input_col.view() cdef type_id tid = ( ( - np_to_cudf_types[kwargs.get('dtype', np.dtype("int64"))] + np_to_cudf_types[kwargs.get('dtype', cudf.dtype("int64"))] ) ) cdef data_type c_out_type = data_type(tid) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 866c2861995..598ac804dd6 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -64,6 +64,7 @@ from cudf._lib.strings.findall import findall from cudf._lib.strings.json import get_json_object from cudf._lib.strings.padding import PadSide, center, ljust, pad, rjust, zfill +from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence from cudf._lib.strings.replace import ( insert, replace, diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index 6eb8984b869..e35ab6489c6 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -6,11 +6,6 @@ from cudf._lib.column cimport Column from cudf._lib.types import np_to_cudf_types 
-from cudf._lib.cpp.types cimport DECIMAL64 -from cudf._lib.types cimport underlying_type_t_type_id - -from cudf.core.column.column import as_column - from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -22,7 +17,8 @@ from cudf._lib.cpp.strings.convert.convert_fixed_point cimport ( is_fixed_point as cpp_is_fixed_point, to_fixed_point as cpp_to_fixed_point, ) -from cudf._lib.cpp.types cimport data_type, type_id +from cudf._lib.cpp.types cimport DECIMAL64, data_type, type_id +from cudf._lib.types cimport underlying_type_t_type_id def from_decimal(Column input_col): diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 58558fade24..74d8e548ad1 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -11,6 +11,7 @@ from cudf._lib.cpp.strings.extract cimport extract as cpp_extract from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def extract(Column source_strings, object pattern): @@ -31,7 +32,7 @@ def extract(Column source_strings, object pattern): pattern_string )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index cc5730c467d..702b0fc8053 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -12,6 +12,7 @@ from cudf._lib.cpp.strings.findall cimport findall_re as cpp_findall_re from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def findall(Column source_strings, pattern): @@ -30,7 +31,7 @@ def findall(Column source_strings, pattern): pattern_string )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx new file mode 100644 index 00000000000..49a46f418b1 --- /dev/null +++ b/python/cudf/cudf/_lib/strings/repeat.pyx @@ -0,0 +1,49 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.strings cimport repeat as cpp_repeat +from cudf._lib.cpp.types cimport size_type + + +def repeat_scalar(Column source_strings, + size_type repeats): + """ + Returns a Column after repeating + each string in `source_strings` + `repeats` number of times. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_repeat.repeat_strings( + source_view, + repeats + )) + + return Column.from_unique_ptr(move(c_result)) + + +def repeat_sequence(Column source_strings, + Column repeats): + """ + Returns a Column after repeating + each string in `source_strings` + `repeats` number of times. 
+ """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + cdef column_view repeats_view = repeats.view() + + with nogil: + c_result = move(cpp_repeat.repeat_strings( + source_view, + repeats_view + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index 590de5bf526..0e62ab69298 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -17,6 +17,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def partition(Column source_strings, @@ -40,7 +41,7 @@ def partition(Column source_strings, scalar_str[0] )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) @@ -67,7 +68,7 @@ def rpartition(Column source_strings, scalar_str[0] )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index 599f7602b51..a2ce237ced6 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -19,6 +19,7 @@ from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_unique_ptr def split(Column source_strings, @@ -45,7 +46,7 @@ def split(Column source_strings, maxsplit )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) @@ -104,7 +105,7 @@ def rsplit(Column source_strings, maxsplit )) - return Table.from_unique_ptr( + return data_from_unique_ptr( move(c_result), column_names=range(0, c_result.get()[0].num_columns()) ) diff --git a/python/cudf/cudf/_lib/table.pxd b/python/cudf/cudf/_lib/table.pxd index e1bffbc3864..0730199c8a9 100644 --- a/python/cudf/cudf/_lib/table.pxd +++ b/python/cudf/cudf/_lib/table.pxd @@ -16,21 +16,6 @@ cdef class Table: cdef table_view index_view(self) except * cdef mutable_table_view mutable_index_view(self) except * - @staticmethod - cdef Table from_unique_ptr( - unique_ptr[table] c_tbl, - column_names, - index_names=* - ) - - @staticmethod - cdef Table from_table_view( - table_view, - owner, - column_names, - index_names=* - ) - cdef table_view make_table_view(columns) except * cdef mutable_table_view make_mutable_table_view(columns) except * cdef columns_from_ptr(unique_ptr[table] c_tbl) diff --git a/python/cudf/cudf/_lib/table.pyi b/python/cudf/cudf/_lib/table.pyi index 2a5dfb2a4dd..ccf0eab99dc 100644 --- a/python/cudf/cudf/_lib/table.pyi +++ b/python/cudf/cudf/_lib/table.pyi @@ -6,7 +6,7 @@ import cudf class Table(object): _data: cudf.core.column_accessor.ColumnAccessor - _index: Optional[cudf.core.index.Index] + _index: Optional[cudf.core.index.BaseIndex] def __init__(self, data: object = None, index: object = None) -> None: ... 
diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index 07d7a0fcf02..2981a46a54a 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -4,8 +4,6 @@ import itertools import numpy as np -from cudf.core.column_accessor import ColumnAccessor - from cython.operator cimport dereference from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr @@ -19,6 +17,8 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport mutable_table_view, table_view from cudf._lib.cpp.types cimport size_type +import cudf + cdef class Table: def __init__(self, object data=None, object index=None): @@ -34,7 +34,7 @@ cdef class Table: """ if data is None: data = {} - self._data = ColumnAccessor(data) + self._data = cudf.core.column_accessor.ColumnAccessor(data) self._index = index @property @@ -71,106 +71,6 @@ cdef class Table: """ return self._data.columns - @staticmethod - cdef Table from_unique_ptr( - unique_ptr[table] c_tbl, - object column_names, - object index_names=None - ): - """ - Construct a Table from a unique_ptr to a cudf::table. - - Parameters - ---------- - c_tbl : unique_ptr[cudf::table] - index_names : iterable - column_names : iterable - """ - cdef vector[unique_ptr[column]] columns - columns = move(c_tbl.get()[0].release()) - - cdef vector[unique_ptr[column]].iterator it = columns.begin() - - # First construct the index, if any - cdef int i - - index = None - if index_names is not None: - index_data = ColumnAccessor._create_unsafe( - { - name: Column.from_unique_ptr( - move(dereference(it + i)) - ) - for i, name in enumerate(index_names) - } - ) - index = Table(data=index_data) - - # Construct the data dict - cdef int n_index_columns = len(index_names) if index_names else 0 - data = ColumnAccessor._create_unsafe( - { - name: Column.from_unique_ptr( - move(dereference(it + i + n_index_columns)) - ) - for i, name in enumerate(column_names) - } - ) - - return Table(data=data, index=index) - - @staticmethod - cdef Table from_table_view( - table_view tv, - object owner, - object column_names, - object index_names=None - ): - """ - Given a ``cudf::table_view``, constructs a ``cudf.Table`` from it, - along with referencing an ``owner`` Python object that owns the memory - lifetime. If ``owner`` is a ``cudf.Table``, we reach inside of it and - reach inside of each ``cudf.Column`` to make the owner of each newly - created ``Buffer`` underneath the ``cudf.Column`` objects of the - created ``cudf.Table`` the respective ``Buffer`` from the relevant - ``cudf.Column`` of the ``owner`` ``cudf.Table``. 
- """ - cdef size_type column_idx = 0 - table_owner = isinstance(owner, Table) - - # First construct the index, if any - index = None - if index_names is not None: - index_columns = [] - for _ in index_names: - column_owner = owner - if table_owner: - column_owner = owner._index._columns[column_idx] - index_columns.append( - Column.from_column_view( - tv.column(column_idx), - column_owner - ) - ) - column_idx += 1 - index = Table(dict(zip(index_names, index_columns))) - - # Construct the data dict - cdef size_type source_column_idx = 0 - data_columns = [] - for _ in column_names: - column_owner = owner - if table_owner: - column_owner = owner._columns[source_column_idx] - data_columns.append( - Column.from_column_view(tv.column(column_idx), column_owner) - ) - column_idx += 1 - source_column_idx += 1 - data = dict(zip(column_names, data_columns)) - - return Table(data=data, index=index) - cdef table_view view(self) except *: """ Return a cudf::table_view of all columns (including index columns) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 9a0c06a6fa1..9fada59640e 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -27,6 +27,7 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.types cimport underlying_type_t_type_id +from cudf._lib.utils cimport data_from_unique_ptr from numba.np import numpy_support @@ -57,8 +58,9 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): Given a mask buffer, returns a boolean column representng bit 0 -> False and 1 -> True within range of [begin_bit, end_bit), """ - if not isinstance(mask_buffer, cudf.core.Buffer): - raise TypeError("mask_buffer is not an instance of cudf.core.Buffer") + if not isinstance(mask_buffer, cudf.core.buffer.Buffer): + raise TypeError("mask_buffer is not an instance of " + "cudf.core.buffer.Buffer") cdef bitmask_type* bit_mask = (mask_buffer.ptr) cdef unique_ptr[column] result @@ -97,7 +99,7 @@ def transform(Column input, op): nb_signature = (nb_type,) compiled_op = cudautils.compile_udf(op, nb_signature) c_str = compiled_op[0].encode('UTF-8') - np_dtype = np.dtype(compiled_op[1]) + np_dtype = cudf.dtype(compiled_op[1]) try: c_tid = ( @@ -151,7 +153,7 @@ def table_encode(Table input): c_result = move(libcudf_transform.encode(c_input)) return ( - Table.from_unique_ptr( + *data_from_unique_ptr( move(c_result.first), column_names=input._column_names, ), diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index 7e4423419c9..0f8f0b6ea14 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -14,6 +14,7 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.transpose cimport transpose as cpp_transpose from cudf._lib.table cimport Table +from cudf._lib.utils cimport data_from_table_view def transpose(Table source): @@ -51,14 +52,14 @@ def transpose(Table source): c_result = move(cpp_transpose(c_input)) result_owner = Column.from_unique_ptr(move(c_result.first)) - result = Table.from_table_view( + data, _ = data_from_table_view( c_result.second, owner=result_owner, column_names=range(source._num_rows) ) if cats is not None: - result = Table(index=result._index, data=[ + data= [ (name, cudf.core.column.column.build_categorical_column( 
codes=cudf.core.column.column.as_column( col.base_data, dtype=col.dtype), @@ -67,7 +68,7 @@ def transpose(Table source): categories=cats, offset=col.offset, )) - for name, col in result._data.items() - ]) + for name, col in data.items() + ] - return result + return data diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index d93e1b75376..d3a4c45f213 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -30,6 +30,7 @@ from cudf.utils.dtypes import ( ) cimport cudf._lib.cpp.types as libcudf_types +import cudf class TypeId(IntEnum): @@ -188,11 +189,11 @@ cdef dtype_from_lists_column_view(column_view cv): cdef column_view child = lv.get()[0].child() if child.type().id() == libcudf_types.type_id.LIST: - return ListDtype(dtype_from_lists_column_view(child)) + return cudf.ListDtype(dtype_from_lists_column_view(child)) elif child.type().id() == libcudf_types.type_id.EMPTY: - return ListDtype(np.dtype("int8")) + return cudf.ListDtype("int8") else: - return ListDtype( + return cudf.ListDtype( dtype_from_column_view(child) ) @@ -201,7 +202,7 @@ cdef dtype_from_structs_column_view(column_view cv): str(i): dtype_from_column_view(cv.child(i)) for i in range(cv.num_children()) } - return StructDtype(fields) + return cudf.StructDtype(fields) cdef dtype_from_column_view(column_view cv): cdef libcudf_types.type_id tid = cv.type().id() @@ -210,26 +211,26 @@ cdef dtype_from_column_view(column_view cv): elif tid == libcudf_types.type_id.STRUCT: return dtype_from_structs_column_view(cv) elif tid == libcudf_types.type_id.DECIMAL64: - return Decimal64Dtype( - precision=Decimal64Dtype.MAX_PRECISION, + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, scale=-cv.type().scale() ) elif tid == libcudf_types.type_id.DECIMAL32: - return Decimal32Dtype( - precision=Decimal32Dtype.MAX_PRECISION, + return cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, scale=-cv.type().scale() ) else: return cudf_to_np_types[(tid)] cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: - if is_list_dtype(dtype): + if cudf.api.types.is_list_dtype(dtype): tid = libcudf_types.type_id.LIST - elif is_struct_dtype(dtype): + elif cudf.api.types.is_struct_dtype(dtype): tid = libcudf_types.type_id.STRUCT - elif is_decimal64_dtype(dtype): + elif cudf.api.types.is_decimal64_dtype(dtype): tid = libcudf_types.type_id.DECIMAL64 - elif is_decimal32_dtype(dtype): + elif cudf.api.types.is_decimal32_dtype(dtype): tid = libcudf_types.type_id.DECIMAL32 else: tid = ( diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index e8ac858d8b2..f9b225a0b89 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -1,10 +1,11 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector from cudf._lib.cpp.column.column cimport column_view -from cudf._lib.cpp.table.table cimport table_view +from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.table cimport Table @@ -12,3 +13,7 @@ cdef vector[column_view] make_column_views(object columns) except* cdef vector[table_view] make_table_views(object tables) except* cdef vector[table_view] make_table_data_views(object tables) except* cdef vector[string] get_column_names(Table table, object index) except* +cdef data_from_unique_ptr( + unique_ptr[table] c_tbl, column_names, index_names=*) +cdef data_from_table_view( + table_view tv, object owner, object column_names, object index_names=*) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index d42e15df9f3..81b62159b59 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -4,13 +4,17 @@ import pyarrow as pa import cudf +from cython.operator cimport dereference from libc.stdint cimport uint8_t +from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column_view +from cudf._lib.cpp.column.column cimport column, column_view from cudf._lib.cpp.table.table cimport table_view +from cudf._lib.cpp.types cimport size_type from cudf._lib.table cimport Table try: @@ -192,3 +196,124 @@ def _index_level_name(index_name, level, column_names): return index_name else: return f"__index_level_{level}__" + + +cdef data_from_unique_ptr( + unique_ptr[table] c_tbl, column_names, index_names=None +): + """Convert a libcudf table into a dict with an index. + + This method is intended to provide the bridge between the columns returned + from calls to libcudf APIs and the cuDF Python Table objects, which require + named columns and a separate index. + + Since cuDF Python has an independent representation of a table as a + collection of columns, this function simply returns a dict of columns + suitable for conversion into data to be passed to cuDF constructors. + This method returns the columns of the table in the order they are + stored in libcudf, but calling code is responsible for partitioning and + labeling them as needed. + + Parameters + ---------- + c_tbl : unique_ptr[cudf::table] + The libcudf table whose columns will be extracted + column_names : iterable + The keys associated with the columns in the output data. + index_names : iterable, optional + If provided, an iterable of strings that will be used to label the + corresponding first set of columns into a (Multi)Index. If this + argument is omitted, all columns are assumed to be part of the output + table and no index is constructed. + + + Returns + ------- + tuple(Dict[str, Column], Optional[Index]) + A dict of the columns in the output table. + """ + cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) + cdef vector[unique_ptr[column]].iterator it = c_columns.begin() + + cdef int i + + columns = [Column.from_unique_ptr(move(dereference(it+i))) + for i in range(c_columns.size())] + + # First construct the index, if any + index = ( + # TODO: For performance, the _from_data methods of Frame types assume + # that the passed index object is already an Index because cudf.Index + # and cudf.as_index are expensive. 
As a result, this function is + # currently somewhat inconsistent in returning a dict of columns for + # the data while actually constructing the Index object here (instead + # of just returning a dict for that as well). As we clean up the + # Frame factories we may want to look for a less dissonant approach + # that does not impose performance penalties. The same applies to + # data_from_table_view below. + cudf.Index._from_data( + { + name: columns[i] + for i, name in enumerate(index_names) + } + ) + if index_names is not None + else None + ) + n_index_columns = len(index_names) if index_names is not None else 0 + data = { + name: columns[i + n_index_columns] + for i, name in enumerate(column_names) + } + return data, index + + +cdef data_from_table_view( + table_view tv, + object owner, + object column_names, + object index_names=None +): + """ + Given a ``cudf::table_view``, constructs a ``cudf.Table`` from it, + along with referencing an ``owner`` Python object that owns the memory + lifetime. If ``owner`` is a ``cudf.Table``, we reach inside of it and + reach inside of each ``cudf.Column`` to make the owner of each newly + created ``Buffer`` underneath the ``cudf.Column`` objects of the + created ``cudf.Table`` the respective ``Buffer`` from the relevant + ``cudf.Column`` of the ``owner`` ``cudf.Table``. + """ + cdef size_type column_idx = 0 + table_owner = isinstance(owner, Table) + + # First construct the index, if any + index = None + if index_names is not None: + index_columns = [] + for _ in index_names: + column_owner = owner + if table_owner: + column_owner = owner._index._columns[column_idx] + index_columns.append( + Column.from_column_view( + tv.column(column_idx), + column_owner + ) + ) + column_idx += 1 + index = cudf.Index._from_data(dict(zip(index_names, index_columns))) + + # Construct the data dict + cdef size_type source_column_idx = 0 + data_columns = [] + for _ in column_names: + column_owner = owner + if table_owner: + column_owner = owner._columns[source_column_idx] + data_columns.append( + Column.from_column_view(tv.column(column_idx), column_owner) + ) + column_idx += 1 + source_column_idx += 1 + + return dict(zip(column_names, data_columns)), index diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 01af22f70bf..bf296e11178 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -14,9 +14,9 @@ from pandas.api import types as pd_types import cudf -from cudf._lib.scalar import DeviceScalar from cudf.core.dtypes import ( # noqa: F401 _BaseDtype, + dtype, is_categorical_dtype, is_decimal32_dtype, is_decimal64_dtype, @@ -124,7 +124,7 @@ def is_scalar(val): Return True if given object is scalar. 
""" return ( - isinstance(val, DeviceScalar) + isinstance(val, cudf._lib.scalar.DeviceScalar) or isinstance(val, cudf.Scalar) or isinstance(val, cudf.core.tools.datetimes.DateOffset) or pd_types.is_scalar(val) diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index 451572224c6..85b4bf20e5c 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -6,10 +6,11 @@ import pandas as pd import pyarrow as pa +from cudf import Series from cudf._lib.gpuarrow import ( CudaRecordBatchStreamReader as _CudaRecordBatchStreamReader, ) -from cudf.core import Series, column +from cudf.core import column from cudf.utils.utils import mask_bitsize, mask_dtype diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index 5eaa5b52fd4..ec4878b332d 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,31 +1 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. - -from cudf.core import _internals, buffer, column, column_accessor, common -from cudf.core.buffer import Buffer -from cudf.core.dataframe import DataFrame, from_pandas, merge -from cudf.core.index import ( - BaseIndex, - CategoricalIndex, - DatetimeIndex, - Float32Index, - Float64Index, - GenericIndex, - Index, - Int8Index, - Int16Index, - Int32Index, - Int64Index, - IntervalIndex, - RangeIndex, - TimedeltaIndex, - UInt8Index, - UInt16Index, - UInt32Index, - UInt64Index, - interval_range, -) -from cudf.core.multiindex import MultiIndex -from cudf.core.scalar import NA, Scalar -from cudf.core.series import Series -import cudf.core.udf -from cudf.core.cut import cut diff --git a/python/cudf/cudf/core/_internals/__init__.py b/python/cudf/cudf/core/_internals/__init__.py index 53d186def85..6faeeffdbec 100644 --- a/python/cudf/cudf/core/_internals/__init__.py +++ b/python/cudf/cudf/core/_internals/__init__.py @@ -1,3 +1 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
- -from cudf.core._internals.where import where diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 87dc1d8e01f..0688283bc43 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -27,7 +27,9 @@ def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> ScalarLike: f"{type(other).__name__} to {col.dtype.name}" ) - return cudf.Scalar(other, dtype=col.dtype if other is None else None) + return cudf.Scalar( + other, dtype=col.dtype if other in {None, cudf.NA} else None + ) def _check_and_cast_columns_with_other( @@ -234,9 +236,15 @@ def where( if isinstance(frame, DataFrame): if hasattr(cond, "__cuda_array_interface__"): - cond = DataFrame( - cond, columns=frame._column_names, index=frame.index - ) + if isinstance(cond, Series): + cond = DataFrame( + {name: cond for name in frame._column_names}, + index=frame.index, + ) + else: + cond = DataFrame( + cond, columns=frame._column_names, index=frame.index + ) elif ( hasattr(cond, "__array_interface__") and cond.__array_interface__["shape"] != frame.shape @@ -378,6 +386,6 @@ def where( if isinstance(frame, Index): result = Index(result, name=frame.name) else: - result = frame._copy_construct(data=result) + result = frame._from_data({frame.name: result}, frame._index) return frame._mimic_inplace(result, inplace=inplace) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index 9f26ac8ee78..50ad592b54f 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -2,7 +2,11 @@ from warnings import warn import cupy as cp +import numpy as np +from cudf.core.column import as_column +from cudf.core.frame import Frame +from cudf.core.index import RangeIndex from cudf.core.series import Index, Series @@ -35,7 +39,7 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): See Also -------- - cudf.core.series.Series.factorize : Encode the input values of Series. + cudf.Series.factorize : Encode the input values of Series. """ if sort: @@ -59,3 +63,55 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): values.name = name return labels, cats.values if return_cupy_array else Index(cats) + + +def _linear_interpolation(column, index=None): + """ + Interpolate over a float column. Implicitly assumes that values are + evenly spaced with respect to the x-axis, for example the data + [1.0, NaN, 3.0] will be interpolated assuming the NaN is half way + between the two valid values, yielding [1.0, 2.0, 3.0] + """ + + index = RangeIndex(start=0, stop=len(column), step=1) + return _index_or_values_interpolation(column, index=index) + + +def _index_or_values_interpolation(column, index=None): + """ + Interpolate over a float column. assumes a linear interpolation + strategy using the index of the data to denote spacing of the x + values. 
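The Series-shaped condition added to `where()` above is broadcast once per column; below is a NumPy stand-in for that broadcast (names are illustrative only, not the cuDF implementation).

import numpy as np

frame = {"a": np.array([1, 2, 3]), "b": np.array([10, 20, 30])}
cond = np.array([True, False, True])        # one value per row, Series-like

# Reuse the same row mask for every column, as the DataFrame branch does.
broadcast = {name: cond for name in frame}
result = {name: np.where(broadcast[name], frame[name], -1) for name in frame}
# result == {"a": array([ 1, -1,  3]), "b": array([10, -1, 30])}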
For example the data and index [1.0, NaN, 4.0], [1, 3, 4] + would result in [1.0, 3.0, 4.0] + """ + # figure out where the nans are + mask = cp.isnan(column) + + # trivial cases, all nan or no nans + num_nan = mask.sum() + if num_nan == 0 or num_nan == len(column): + return column + + to_interp = Frame(data={None: column}, index=index) + known_x_and_y = to_interp._apply_boolean_mask(as_column(~mask)) + + known_x = known_x_and_y._index._column.values + known_y = known_x_and_y._data.columns[0].values + + result = cp.interp(to_interp._index.values, known_x, known_y) + + # find the first nan + first_nan_idx = (mask == 0).argmax().item() + result[:first_nan_idx] = np.nan + return result + + +def get_column_interpolator(method): + interpolator = { + "linear": _linear_interpolation, + "index": _index_or_values_interpolation, + "values": _index_or_values_interpolation, + }.get(method, None) + if not interpolator: + raise ValueError(f"Interpolation method `{method}` not found") + return interpolator diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index c6875052685..0658927975f 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -11,10 +11,28 @@ import rmm from rmm import DeviceBuffer +import cudf from cudf.core.abc import Serializable class Buffer(Serializable): + """ + A Buffer represents a device memory allocation. + + Parameters + ---------- + data : Buffer, array_like, int + An array-like object or integer representing a + device or host pointer to pre-allocated memory. + size : int, optional + Size of memory allocation. Required if a pointer + is passed for `data`. + owner : object, optional + Python object to which the lifetime of the memory + allocation is tied. If provided, a reference to this + object is kept in this Buffer. + """ + ptr: int size: int _owner: Any @@ -22,22 +40,7 @@ class Buffer(Serializable): def __init__( self, data: Any = None, size: Optional[int] = None, owner: Any = None ): - """ - A Buffer represents a device memory allocation. - - Parameters - ---------- - data : Buffer, array_like, int - An array-like object or integer representing a - device or host pointer to pre-allocated memory. - size : int, optional - Size of memory allocation. Required if a pointer - is passed for `data`. - owner : object, optional - Python object to which the lifetime of the memory - allocation is tied. If provided, a reference to this - object is kept in this Buffer. 
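A NumPy sketch of the index-based interpolation helper defined above; it mirrors the mask/`interp`/leading-NaN steps but is not the cuDF implementation itself.

import numpy as np

values = np.array([1.0, np.nan, 4.0])
index = np.array([1.0, 3.0, 4.0])

mask = np.isnan(values)
known_x, known_y = index[~mask], values[~mask]

result = np.interp(index, known_x, known_y)
first_valid = int((~mask).argmax())
result[:first_valid] = np.nan       # leading NaNs are never back-filled
# result == array([1., 3., 4.])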
- """ + if isinstance(data, Buffer): self.ptr = data.ptr self.size = data.size @@ -157,7 +160,7 @@ def _buffer_data_from_array_interface(array_interface): ptr = array_interface["data"][0] if ptr is None: ptr = 0 - itemsize = np.dtype(array_interface["typestr"]).itemsize + itemsize = cudf.dtype(array_interface["typestr"]).itemsize shape = ( array_interface["shape"] if len(array_interface["shape"]) > 0 else (1,) ) @@ -168,7 +171,7 @@ def _buffer_data_from_array_interface(array_interface): def confirm_1d_contiguous(array_interface): strides = array_interface["strides"] shape = array_interface["shape"] - itemsize = np.dtype(array_interface["typestr"]).itemsize + itemsize = cudf.dtype(array_interface["typestr"]).itemsize typestr = array_interface["typestr"] if typestr not in ("|i1", "|u1"): raise TypeError("Buffer data must be of uint8 type") diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 48398e03b2d..7333ae119cd 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -22,7 +22,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.scalar import as_device_scalar from cudf._lib.transform import bools_to_mask from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer @@ -49,62 +48,63 @@ class CategoricalAccessor(ColumnMethods): + """ + Accessor object for categorical properties of the Series values. + Be aware that assigning to `categories` is a inplace operation, + while all methods return new categorical data per default. + + Parameters + ---------- + column : Column + parent : Series or CategoricalIndex + + Examples + -------- + >>> s = cudf.Series([1,2,3], dtype='category') + >>> s + >>> s + 0 1 + 1 2 + 2 3 + dtype: category + Categories (3, int64): [1, 2, 3] + >>> s.cat.categories + Int64Index([1, 2, 3], dtype='int64') + >>> s.cat.reorder_categories([3,2,1]) + 0 1 + 1 2 + 2 3 + dtype: category + Categories (3, int64): [3, 2, 1] + >>> s.cat.remove_categories([1]) + 0 + 1 2 + 2 3 + dtype: category + Categories (2, int64): [2, 3] + >>> s.cat.set_categories(list('abcde')) + 0 + 1 + 2 + dtype: category + Categories (5, object): ['a', 'b', 'c', 'd', 'e'] + >>> s.cat.as_ordered() + 0 1 + 1 2 + 2 3 + dtype: category + Categories (3, int64): [1 < 2 < 3] + >>> s.cat.as_unordered() + 0 1 + 1 2 + 2 3 + dtype: category + Categories (3, int64): [1, 2, 3] + """ + _column: CategoricalColumn def __init__(self, parent: SeriesOrIndex): - """ - Accessor object for categorical properties of the Series values. - Be aware that assigning to `categories` is a inplace operation, - while all methods return new categorical data per default. 
- - Parameters - ---------- - column : Column - parent : Series or CategoricalIndex - - Examples - -------- - >>> s = cudf.Series([1,2,3], dtype='category') - >>> s - >>> s - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [1, 2, 3] - >>> s.cat.categories - Int64Index([1, 2, 3], dtype='int64') - >>> s.cat.reorder_categories([3,2,1]) - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [3, 2, 1] - >>> s.cat.remove_categories([1]) - 0 - 1 2 - 2 3 - dtype: category - Categories (2, int64): [2, 3] - >>> s.cat.set_categories(list('abcde')) - 0 - 1 - 2 - dtype: category - Categories (5, object): ['a', 'b', 'c', 'd', 'e'] - >>> s.cat.as_ordered() - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [1 < 2 < 3] - >>> s.cat.as_unordered() - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [1, 2, 3] - """ if not is_categorical_dtype(parent.dtype): raise AttributeError( "Can only use .cat accessor with a 'category' dtype" @@ -525,50 +525,12 @@ def set_categories( dtype: category Categories (2, int64): [1, 10] """ - ordered = ordered if ordered is not None else self.ordered - new_categories = column.as_column(new_categories) - - if isinstance(new_categories, CategoricalColumn): - new_categories = new_categories.categories - - # when called with rename=True, the pandas behavior is - # to replace the current category values with the new - # categories. - if rename: - # enforce same length - if len(new_categories) != len(self._column.categories): - raise ValueError( - "new_categories must have the same " - "number of items as old categories" - ) - - out_col = column.build_categorical_column( - categories=new_categories, - codes=self._column.base_children[0], - mask=self._column.base_mask, - size=self._column.size, - offset=self._column.offset, - ordered=ordered, - ) - else: - out_col = self._column - if not (type(out_col.categories) is type(new_categories)): - # If both categories are of different Column types, - # return a column full of Nulls. - out_col = _create_empty_categorical_column( - self._column, - CategoricalDtype( - categories=new_categories, ordered=ordered - ), - ) - elif ( - not out_col._categories_equal(new_categories, ordered=ordered) - or not self.ordered == ordered - ): - out_col = out_col._set_categories( - new_categories, ordered=ordered, - ) - return self._return_or_inplace(out_col, inplace=inplace) + return self._return_or_inplace( + self._column.set_categories( + new_categories=new_categories, ordered=ordered, rename=rename + ), + inplace=inplace, + ) def reorder_categories( self, @@ -648,7 +610,19 @@ def reorder_categories( class CategoricalColumn(column.ColumnBase): - """Implements operations for Columns of Categorical type + """ + Implements operations for Columns of Categorical type + + Parameters + ---------- + dtype : CategoricalDtype + mask : Buffer + The validity mask + offset : int + Data offset + children : Tuple[ColumnBase] + Two non-null columns containing the categories and codes + respectively """ dtype: cudf.core.dtypes.CategoricalDtype @@ -664,18 +638,7 @@ def __init__( null_count: int = None, children: Tuple["column.ColumnBase", ...] 
= (), ): - """ - Parameters - ---------- - dtype : CategoricalDtype - mask : Buffer - The validity mask - offset : int - Data offset - children : Tuple[ColumnBase] - Two non-null columns containing the categories and codes - respectively - """ + if size is None: for child in children: assert child.offset == 0 @@ -882,7 +845,9 @@ def _fill( return self if inplace else self.copy() fill_code = self._encode(fill_value) - fill_scalar = as_device_scalar(fill_code, self.codes.dtype) + fill_scalar = cudf._lib.scalar.as_device_scalar( + fill_code, self.codes.dtype + ) result = self if inplace else self.copy() diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a5e49b026f3..d52f63a79f5 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -53,7 +53,6 @@ is_scalar, is_string_dtype, is_struct_dtype, - pandas_dtype, ) from cudf.core.abc import Serializable from cudf.core.buffer import Buffer @@ -65,11 +64,11 @@ ) from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( - check_cast_unsupported_dtype, cudf_dtype_from_pa_type, get_time_unit, min_unsigned_type, np_to_pa_dtype, + pandas_dtypes_alias_to_cudf_alias, pandas_dtypes_to_cudf_dtypes, ) from cudf.utils.utils import mask_dtype @@ -82,7 +81,7 @@ def as_frame(self) -> "cudf.core.frame.Frame": """ Converts a Column to Frame """ - return cudf.core.frame.Frame({None: self.copy(deep=False)}) + return cudf.core.frame.SingleColumnFrame({None: self.copy(deep=False)}) @property def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": @@ -171,11 +170,31 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: def _null_equals(self, other: ColumnBase) -> ColumnBase: return self.binary_operator("NULL_EQUALS", other) - def all(self) -> bool: - return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_)) + def all(self, skipna: bool = True) -> bool: + # If all entries are null the result is True, including when the column + # is empty. + result_col = self.nans_to_nulls() if skipna else self - def any(self) -> bool: - return bool(libcudf.reduce.reduce("any", self, dtype=np.bool_)) + if result_col.null_count == result_col.size: + return True + + if isinstance(result_col, ColumnBase): + return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + else: + return result_col + + def any(self, skipna: bool = True) -> bool: + # Early exit for fast cases. 
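The skipna-aware reductions added above give all-null columns a fixed answer; a small usage sketch, assuming cuDF is installed:

import cudf

s = cudf.Series([None, None], dtype="float64")
print(s.all())                         # True: every entry is null
print(s.any())                         # False: no non-null truthy entry
print(cudf.Series([1.0, None]).any())  # True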
+ result_col = self.nans_to_nulls() if skipna else self + if not skipna and result_col.has_nulls: + return True + elif skipna and result_col.null_count == result_col.size: + return False + + if isinstance(result_col, ColumnBase): + return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + else: + return result_col def __sizeof__(self) -> int: n = 0 @@ -241,7 +260,9 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: """ if not isinstance(array, (pa.Array, pa.ChunkedArray)): raise TypeError("array should be PyArrow array or chunked array") + data = pa.table([array], [None]) + if isinstance(array.type, pa.DictionaryType): indices_table = pa.table( { @@ -262,10 +283,10 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: codes = libcudf.interop.from_arrow( indices_table, indices_table.column_names - )._data["None"] + )[0]["None"] categories = libcudf.interop.from_arrow( dictionaries_table, dictionaries_table.column_names - )._data["None"] + )[0]["None"] return build_categorical_column( categories=categories, @@ -283,9 +304,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: elif isinstance(array.type, pa.Decimal128Type): return cudf.core.column.Decimal64Column.from_arrow(array) - result = libcudf.interop.from_arrow(data, data.column_names)._data[ - "None" - ] + result = libcudf.interop.from_arrow(data, data.column_names)[0]["None"] result = result._with_type_metadata( cudf_dtype_from_pa_type(array.type) @@ -373,14 +392,6 @@ def _fill( return self - fill_code = self._encode(fill_value) - fill_scalar = as_device_scalar(fill_code, self.codes.dtype) - - result = self if inplace else self.copy() - - libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) - return result - def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: return libcudf.copying.shift(self, offset, fill_value) @@ -433,7 +444,7 @@ def view(self, dtype: Dtype) -> ColumnBase: """ - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.kind in ("o", "u", "s"): raise TypeError( @@ -502,7 +513,10 @@ def slice(self, start: int, stop: int, stride: int = None) -> ColumnBase: else: # Need to create a gather map for given slice with stride gather_map = arange( - start=start, stop=stop, step=stride, dtype=np.dtype(np.int32), + start=start, + stop=stop, + step=stride, + dtype=cudf.dtype(np.int32), ) return self.take(gather_map) @@ -545,7 +559,7 @@ def __setitem__(self, key: Any, value: Any): start=key_start, stop=key_stop, step=key_stride, - dtype=np.dtype(np.int32), + dtype=cudf.dtype(np.int32), ) nelem = len(key) else: @@ -881,12 +895,16 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: if is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) - dtype = pandas_dtypes_to_cudf_dtypes.get(dtype, dtype) + dtype = ( + pandas_dtypes_alias_to_cudf_alias.get(dtype, dtype) + if isinstance(dtype, str) + else pandas_dtypes_to_cudf_dtypes.get(dtype, dtype) + ) if _is_non_decimal_numeric_dtype(dtype): return self.as_numerical_column(dtype, **kwargs) elif is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) - elif pandas_dtype(dtype).type in { + elif cudf.dtype(dtype).type in { np.str_, np.object_, str, @@ -908,9 +926,9 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: return self.as_interval_column(dtype, **kwargs) elif is_decimal_dtype(dtype): return self.as_decimal_column(dtype, **kwargs) - elif np.issubdtype(dtype, np.datetime64): + elif np.issubdtype(cast(Any, dtype), np.datetime64): return self.as_datetime_column(dtype, 
**kwargs) - elif np.issubdtype(dtype, np.timedelta64): + elif np.issubdtype(cast(Any, dtype), np.timedelta64): return self.as_timedelta_column(dtype, **kwargs) else: return self.as_numerical_column(dtype, **kwargs) @@ -948,7 +966,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: cats = cats._column.dropna(drop_nan=False) min_type = min_unsigned_type(len(cats), 8) labels = labels - 1 - if np.dtype(min_type).itemsize < labels.dtype.itemsize: + if cudf.dtype(min_type).itemsize < labels.dtype.itemsize: labels = labels.astype(min_type) return build_categorical_column( @@ -985,9 +1003,7 @@ def as_string_column( def as_decimal_column( self, dtype: Dtype, **kwargs - ) -> Union[ - "cudf.core.column.Decimal32Column", "cudf.core.column.Decimal64Column" - ]: + ) -> Union["cudf.core.column.decimal.DecimalBaseColumn"]: raise NotImplementedError def as_decimal64_column( @@ -1235,57 +1251,6 @@ def _process_for_reduction( ) return result_col - def scatter_to_table( - self, - row_indices: ColumnBase, - column_indices: ColumnBase, - names: List[Any], - nrows: int = None, - ncols: int = None, - ) -> "cudf.core.frame.Frame": - """ - Scatters values from the column into a table. - - Parameters - ---------- - row_indices - A column of the same size as `self` specifying the - row index to scatter each value to - column_indices - A column of the same size as `self` specifying the - column index to scatter each value to - names - The column names of the resulting table - - Returns - ------- - """ - if nrows is None: - nrows = 0 - if len(row_indices) > 0: - nrows = int(row_indices.max() + 1) - - if ncols is None: - ncols = 0 - if len(column_indices) > 0: - ncols = int(column_indices.max() + 1) - - if nrows * ncols == 0: - return cudf.core.frame.Frame({}) - - scatter_map = (column_indices * np.int32(nrows)) + row_indices - target = cudf.core.frame.Frame( - {None: column_empty_like(self, masked=True, newsize=nrows * ncols)} - ) - target._data[None][scatter_map] = self - result_frames = target._split(range(nrows, nrows * ncols, nrows)) - return cudf.core.frame.Frame( - { - name: next(iter(f._columns)) - for name, f in zip(names, result_frames) - } - ) - def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: """ Copies type metadata from self onto other, returning a new column. @@ -1347,7 +1312,7 @@ def column_empty( ) -> ColumnBase: """Allocate a new column like the given row_count and dtype. """ - dtype = pandas_dtype(dtype) + dtype = cudf.dtype(dtype) children = () # type: Tuple[ColumnBase, ...] 
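`cudf.dtype` replaces `np.dtype` throughout this diff; for NumPy-style names it resolves to the corresponding NumPy dtype. A minimal sketch, assuming cuDF is installed:

import cudf

print(cudf.dtype("int32"))            # int32
print(cudf.dtype("datetime64[ns]"))   # datetime64[ns]
print(cudf.dtype("int32").itemsize)   # 4, usable wherever np.dtype was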
if is_struct_dtype(dtype): @@ -1360,7 +1325,7 @@ def column_empty( data = None children = ( build_column( - data=Buffer.empty(row_count * np.dtype("int32").itemsize), + data=Buffer.empty(row_count * cudf.dtype("int32").itemsize), dtype="int32", ), ) @@ -1369,7 +1334,7 @@ def column_empty( children = ( full(row_count + 1, 0, dtype="int32"), build_column( - data=Buffer.empty(row_count * np.dtype("int8").itemsize), + data=Buffer.empty(row_count * cudf.dtype("int8").itemsize), dtype="int8", ), ) @@ -1412,7 +1377,7 @@ def build_column( offset : int, optional children : tuple, optional """ - dtype = pandas_dtype(dtype) + dtype = cudf.dtype(dtype) if _is_non_decimal_numeric_dtype(dtype): assert data is not None @@ -1768,9 +1733,9 @@ def as_column( elif hasattr(arbitrary, "__cuda_array_interface__"): desc = arbitrary.__cuda_array_interface__ - current_dtype = np.dtype(desc["typestr"]) + current_dtype = cudf.dtype(desc["typestr"]) - arb_dtype = check_cast_unsupported_dtype(current_dtype) + arb_dtype = cudf.dtype(current_dtype) if desc.get("mask", None) is not None: # Extract and remove the mask from arbitrary before @@ -1817,9 +1782,9 @@ def as_column( col = ColumnBase.from_arrow(arbitrary) if isinstance(arbitrary, pa.NullArray): if type(dtype) == str and dtype == "empty": - new_dtype = pandas_dtype(arbitrary.type.to_pandas_dtype()) + new_dtype = cudf.dtype(arbitrary.type.to_pandas_dtype()) else: - new_dtype = pandas_dtype(dtype) + new_dtype = cudf.dtype(dtype) col = col.astype(new_dtype) return col @@ -1836,7 +1801,7 @@ def as_column( elif arbitrary.dtype == np.bool_: data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype) elif arbitrary.dtype.kind in ("f"): - arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype) + arb_dtype = cudf.dtype(arbitrary.dtype) data = as_column( cupy.asarray(arbitrary, dtype=arb_dtype), nan_as_null=nan_as_null, @@ -1874,7 +1839,7 @@ def as_column( ): arbitrary = None if dtype is None: - dtype = np.dtype("float64") + dtype = cudf.dtype("float64") data = as_column( utils.scalar_broadcast_to(arbitrary, length, dtype=dtype) @@ -1889,7 +1854,7 @@ def as_column( # CUDF assumes values are always contiguous desc = arbitrary.__array_interface__ shape = desc["shape"] - arb_dtype = np.dtype(desc["typestr"]) + arb_dtype = cudf.dtype(desc["typestr"]) # CUDF assumes values are always contiguous if len(shape) > 1: raise ValueError("Data must be 1-dimensional") @@ -1913,7 +1878,7 @@ def as_column( arbitrary = np.ascontiguousarray(arbitrary) if dtype is not None: - arbitrary = arbitrary.astype(dtype) + arbitrary = arbitrary.astype(np.dtype(dtype)) if arb_dtype.kind == "M": @@ -1921,7 +1886,7 @@ def as_column( cast_dtype = time_unit in ("D", "W", "M", "Y") if cast_dtype: - arbitrary = arbitrary.astype(np.dtype("datetime64[s]")) + arbitrary = arbitrary.astype(cudf.dtype("datetime64[s]")) buffer = Buffer(arbitrary.view("|u1")) mask = None @@ -1941,7 +1906,7 @@ def as_column( cast_dtype = time_unit in ("D", "W", "M", "Y") if cast_dtype: - arbitrary = arbitrary.astype(np.dtype("timedelta64[s]")) + arbitrary = arbitrary.astype(cudf.dtype("timedelta64[s]")) buffer = Buffer(arbitrary.view("|u1")) mask = None @@ -1980,9 +1945,7 @@ def as_column( if dtype is not None: data = data.astype(dtype) elif arb_dtype.kind in ("f"): - arb_dtype = check_cast_unsupported_dtype( - arb_dtype if dtype is None else dtype - ) + arb_dtype = cudf.dtype(arb_dtype if dtype is None else dtype) data = as_column( cupy.asarray(arbitrary, dtype=arb_dtype), nan_as_null=nan_as_null, @@ -1995,9 +1958,9 @@ def 
as_column( arb_dtype = arbitrary.dtype else: if arbitrary.dtype == pd.StringDtype(): - arb_dtype = np.dtype("O") + arb_dtype = cudf.dtype("O") else: - arb_dtype = check_cast_unsupported_dtype(arbitrary.dtype) + arb_dtype = cudf.dtype(arbitrary.dtype) if arb_dtype != arbitrary.dtype.numpy_dtype: arbitrary = arbitrary.astype(arb_dtype) if ( @@ -2044,6 +2007,29 @@ def as_column( memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null ) except TypeError: + if dtype is not None: + # Arrow throws a type error if the input is of + # mixed-precision and cannot fit into the provided + # decimal type properly, see: + # https://github.com/apache/arrow/pull/9948 + # Hence we should let the exception propagate to + # the user. + if isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): + data = pa.array( + arbitrary, + type=pa.decimal128( + precision=dtype.precision, scale=dtype.scale + ), + ) + return cudf.core.column.Decimal64Column.from_arrow(data) + if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): + data = pa.array( + arbitrary, + type=pa.decimal128( + precision=dtype.precision, scale=dtype.scale + ), + ) + return cudf.core.column.Decimal32Column.from_arrow(data) pa_type = None np_type = None try: @@ -2082,7 +2068,6 @@ def as_column( return cudf.core.column.Decimal32Column.from_arrow( data ) - dtype = pd.api.types.pandas_dtype(dtype) np_type = np.dtype(dtype).type if np_type == np.bool_: pa_type = pa.bool_() @@ -2136,7 +2121,7 @@ def _construct_array( Construct a CuPy or NumPy array from `arbitrary` """ try: - dtype = dtype if dtype is None else np.dtype(dtype) + dtype = dtype if dtype is None else cudf.dtype(dtype) arbitrary = cupy.asarray(arbitrary, dtype=dtype) except (TypeError, ValueError): native_dtype = dtype @@ -2150,7 +2135,7 @@ def _construct_array( arbitrary, dtype=native_dtype if native_dtype is None - else np.dtype(native_dtype), + else cudf.dtype(native_dtype), ) return arbitrary @@ -2159,7 +2144,7 @@ def _data_from_cuda_array_interface_desc(obj) -> Buffer: desc = obj.__cuda_array_interface__ ptr = desc["data"][0] nelem = desc["shape"][0] if len(desc["shape"]) > 0 else 1 - dtype = np.dtype(desc["typestr"]) + dtype = cudf.dtype(desc["typestr"]) data = Buffer(data=ptr, size=nelem * dtype.itemsize, owner=obj) return data @@ -2328,7 +2313,7 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: - dtype = pandas_dtype(None) + dtype = cudf.dtype(None) return column_empty(0, dtype=dtype, masked=True) # If all columns are `NumericalColumn` with different dtypes, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index f3d1880b290..46ff1990ac2 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -52,6 +52,19 @@ class DatetimeColumn(column.ColumnBase): + """ + A Column implementation for Date-time types. 
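The decimal branch added to `as_column` above lets list input be converted through pyarrow when an explicit decimal dtype is given; a hedged construction example:

import decimal
import cudf

dt = cudf.Decimal64Dtype(precision=7, scale=2)
s = cudf.Series([decimal.Decimal("1.23"), decimal.Decimal("4.50")], dtype=dt)
print(s.dtype)   # Decimal64Dtype(precision=7, scale=2)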
+ + Parameters + ---------- + data : Buffer + The datetime values + dtype : np.dtype + The data type + mask : Buffer; optional + The validity mask + """ + def __init__( self, data: Buffer, @@ -61,17 +74,8 @@ def __init__( offset: int = 0, null_count: int = None, ): - """ - Parameters - ---------- - data : Buffer - The datetime values - dtype : np.dtype - The data type - mask : Buffer; optional - The validity mask - """ - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) + if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -154,6 +158,15 @@ def to_pandas( index=index, ) + @property + def values(self): + """ + Return a CuPy representation of the DateTimeColumn. + """ + raise NotImplementedError( + "DateTime Arrays is not yet implemented in cudf" + ) + def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component(self, field) @@ -236,7 +249,7 @@ def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: return output def as_datetime_column(self, dtype: Dtype, **kwargs) -> DatetimeColumn: - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) @@ -264,7 +277,7 @@ def as_string_column( ) if len(self) > 0: return string._datetime_to_str_typecast_functions[ - np.dtype(self.dtype) + cudf.dtype(self.dtype) ](self, format) else: return cast( @@ -316,7 +329,7 @@ def binary_operator( return rhs._datetime_binop(self, op, reflect=reflect) lhs: Union[ScalarLike, ColumnBase] = self if op in ("eq", "ne", "lt", "gt", "le", "ge", "NULL_EQUALS"): - out_dtype = np.dtype(np.bool_) # type: Dtype + out_dtype = cudf.dtype(np.bool_) # type: Dtype elif op == "add" and pd.api.types.is_timedelta64_dtype(rhs.dtype): out_dtype = cudf.core.column.timedelta._timedelta_add_result_dtype( rhs, lhs @@ -389,13 +402,13 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: to_res, _ = np.datetime_data(to_dtype) self_res, _ = np.datetime_data(self.dtype) - max_int = np.iinfo(np.dtype("int64")).max + max_int = np.iinfo(cudf.dtype("int64")).max max_dist = np.timedelta64( - self.max().astype(np.dtype("int64"), copy=False), self_res + self.max().astype(cudf.dtype("int64"), copy=False), self_res ) min_dist = np.timedelta64( - self.min().astype(np.dtype("int64"), copy=False), self_res + self.min().astype(cudf.dtype("int64"), copy=False), self_res ) self_delta_dtype = np.timedelta64(0, self_res).dtype @@ -408,7 +421,7 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: return True else: return False - elif to_dtype == np.dtype("int64") or to_dtype == np.dtype("O"): + elif to_dtype == cudf.dtype("int64") or to_dtype == cudf.dtype("O"): # can safely cast to representation, or string return True else: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index c667799c7c2..47f39eb570d 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -25,7 +25,30 @@ from .numerical_base import NumericalBaseColumn -class Decimal32Column(NumericalBaseColumn): +class DecimalBaseColumn(NumericalBaseColumn): + """Base column for decimal64 and decimal32 columns + """ + + dtype: Union[Decimal32Dtype, Decimal64Dtype] + + def as_decimal_column( + self, dtype: Dtype, **kwargs + ) -> Union["DecimalBaseColumn"]: + if ( + isinstance(dtype, (Decimal64Dtype, Decimal32Dtype)) + and dtype.scale < self.dtype.scale + ): + warn( + "cuDF truncates when downcasting decimals to a lower scale. 
" + "To round, use Series.round() or DataFrame.round()." + ) + + if dtype == self.dtype: + return self + return libcudf.unary.cast(self, dtype) + + +class Decimal32Column(DecimalBaseColumn): dtype: Decimal32Dtype @classmethod @@ -78,7 +101,7 @@ def to_arrow(self): ) -class Decimal64Column(NumericalBaseColumn): +class Decimal64Column(DecimalBaseColumn): dtype: Decimal64Dtype def __truediv__(self, other): @@ -202,24 +225,6 @@ def _decimal_quantile( return result._with_type_metadata(self.dtype) - def as_decimal_column( - self, dtype: Dtype, **kwargs - ) -> Union[ - "cudf.core.column.Decimal32Column", "cudf.core.column.Decimal64Column" - ]: - if ( - isinstance(dtype, Decimal64Dtype) - and dtype.scale < self.dtype.scale - ): - warn( - "cuDF truncates when downcasting decimals to a lower scale. " - "To round, use Series.round() or DataFrame.round()." - ) - - if dtype == self.dtype: - return self - return libcudf.unary.cast(self, dtype) - def as_numerical_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.NumericalColumn": diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 27dc4fe0c0d..a587c58a49d 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -63,8 +63,8 @@ def _return_or_inplace( """ if inplace: self._parent._mimic_inplace( - self._parent.__class__._from_table( - cudf._lib.table.Table({self._parent.name: new_col}) + self._parent.__class__._from_data( + {self._parent.name: new_col} ), inplace=True, ) @@ -78,8 +78,8 @@ def _return_or_inplace( table = new_col if isinstance(self._parent, cudf.BaseIndex): - idx = self._parent._constructor_expanddim._from_table( - table=table + idx = self._parent._constructor_expanddim._from_data( + table._data, table._index ) idx.names = None return idx diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 1ac3f1de6a2..bc12b42a3fa 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -36,6 +36,17 @@ class NumericalColumn(NumericalBaseColumn): + """ + A Column object for Numeric types. 
+ + Parameters + ---------- + data : Buffer + dtype : np.dtype + The dtype associated with the data Buffer + mask : Buffer, optional + """ + def __init__( self, data: Buffer, @@ -45,15 +56,8 @@ def __init__( offset: int = 0, null_count: int = None, ): - """ - Parameters - ---------- - data : Buffer - dtype : np.dtype - The dtype associated with the data Buffer - mask : Buffer, optional - """ - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) + if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -121,14 +125,14 @@ def binary_operator( self, binop: str, rhs: BinaryOperand, reflect: bool = False, ) -> ColumnBase: int_dtypes = [ - np.dtype("int8"), - np.dtype("int16"), - np.dtype("int32"), - np.dtype("int64"), - np.dtype("uint8"), - np.dtype("uint16"), - np.dtype("uint32"), - np.dtype("uint64"), + cudf.dtype("int8"), + cudf.dtype("int16"), + cudf.dtype("int32"), + cudf.dtype("int64"), + cudf.dtype("uint8"), + cudf.dtype("uint16"), + cudf.dtype("uint32"), + cudf.dtype("uint64"), ] if rhs is None: out_dtype = self.dtype @@ -158,7 +162,7 @@ def binary_operator( (np.isscalar(tmp) and (0 == tmp)) or ((isinstance(tmp, NumericalColumn)) and (0.0 in tmp)) ): - out_dtype = np.dtype("float64") + out_dtype = cudf.dtype("float64") if binop in { "l_and", @@ -193,13 +197,13 @@ def normalize_binop_value( if isinstance(other, cudf.Scalar): return other other_dtype = np.promote_types(self.dtype, other_dtype) - if other_dtype == np.dtype("float16"): - other_dtype = np.dtype("float32") + if other_dtype == cudf.dtype("float16"): + other_dtype = cudf.dtype("float32") other = other_dtype.type(other) if self.dtype.kind == "b": other_dtype = min_signed_type(other) if np.isscalar(other): - other = np.dtype(other_dtype).type(other) + other = cudf.dtype(other_dtype).type(other) return other else: ary = utils.scalar_broadcast_to( @@ -212,7 +216,7 @@ def normalize_binop_value( raise TypeError(f"cannot broadcast {type(other)}") def int2ip(self) -> "cudf.core.column.StringColumn": - if self.dtype != np.dtype("int64"): + if self.dtype != cudf.dtype("int64"): raise TypeError("Only int64 type can be converted to ip") return libcudf.string_casting.int2ip(self) @@ -222,7 +226,7 @@ def as_string_column( ) -> "cudf.core.column.StringColumn": if len(self) > 0: return string._numeric_to_str_typecast_functions[ - np.dtype(self.dtype) + cudf.dtype(self.dtype) ](self) else: return cast( @@ -263,7 +267,7 @@ def as_decimal_column( return libcudf.unary.cast(self, dtype) def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype) @@ -618,7 +622,7 @@ def _safe_cast_to_int(col: ColumnBase, dtype: DtypeObj) -> ColumnBase: else: raise TypeError( f"Cannot safely cast non-equivalent " - f"{col.dtype.type.__name__} to {np.dtype(dtype).type.__name__}" + f"{col.dtype.type.__name__} to {cudf.dtype(dtype).type.__name__}" ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 92c57477465..c4b07c41b06 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -53,62 +53,63 @@ def str_to_boolean(column: StringColumn): _str_to_numeric_typecast_functions = { - np.dtype("int8"): str_cast.stoi8, - np.dtype("int16"): str_cast.stoi16, - np.dtype("int32"): str_cast.stoi, - np.dtype("int64"): str_cast.stol, - np.dtype("uint8"): str_cast.stoui8, - np.dtype("uint16"): 
str_cast.stoui16, - np.dtype("uint32"): str_cast.stoui, - np.dtype("uint64"): str_cast.stoul, - np.dtype("float32"): str_cast.stof, - np.dtype("float64"): str_cast.stod, - np.dtype("bool"): str_to_boolean, + cudf.dtype("int8"): str_cast.stoi8, + cudf.dtype("int16"): str_cast.stoi16, + cudf.dtype("int32"): str_cast.stoi, + cudf.dtype("int64"): str_cast.stol, + cudf.dtype("uint8"): str_cast.stoui8, + cudf.dtype("uint16"): str_cast.stoui16, + cudf.dtype("uint32"): str_cast.stoui, + cudf.dtype("uint64"): str_cast.stoul, + cudf.dtype("float32"): str_cast.stof, + cudf.dtype("float64"): str_cast.stod, + cudf.dtype("bool"): str_to_boolean, } _numeric_to_str_typecast_functions = { - np.dtype("int8"): str_cast.i8tos, - np.dtype("int16"): str_cast.i16tos, - np.dtype("int32"): str_cast.itos, - np.dtype("int64"): str_cast.ltos, - np.dtype("uint8"): str_cast.ui8tos, - np.dtype("uint16"): str_cast.ui16tos, - np.dtype("uint32"): str_cast.uitos, - np.dtype("uint64"): str_cast.ultos, - np.dtype("float32"): str_cast.ftos, - np.dtype("float64"): str_cast.dtos, - np.dtype("bool"): str_cast.from_booleans, + cudf.dtype("int8"): str_cast.i8tos, + cudf.dtype("int16"): str_cast.i16tos, + cudf.dtype("int32"): str_cast.itos, + cudf.dtype("int64"): str_cast.ltos, + cudf.dtype("uint8"): str_cast.ui8tos, + cudf.dtype("uint16"): str_cast.ui16tos, + cudf.dtype("uint32"): str_cast.uitos, + cudf.dtype("uint64"): str_cast.ultos, + cudf.dtype("float32"): str_cast.ftos, + cudf.dtype("float64"): str_cast.dtos, + cudf.dtype("bool"): str_cast.from_booleans, } _datetime_to_str_typecast_functions = { # TODO: support Date32 UNIX days - # np.dtype("datetime64[D]"): str_cast.int2timestamp, - np.dtype("datetime64[s]"): str_cast.int2timestamp, - np.dtype("datetime64[ms]"): str_cast.int2timestamp, - np.dtype("datetime64[us]"): str_cast.int2timestamp, - np.dtype("datetime64[ns]"): str_cast.int2timestamp, + # cudf.dtype("datetime64[D]"): str_cast.int2timestamp, + cudf.dtype("datetime64[s]"): str_cast.int2timestamp, + cudf.dtype("datetime64[ms]"): str_cast.int2timestamp, + cudf.dtype("datetime64[us]"): str_cast.int2timestamp, + cudf.dtype("datetime64[ns]"): str_cast.int2timestamp, } _timedelta_to_str_typecast_functions = { - np.dtype("timedelta64[s]"): str_cast.int2timedelta, - np.dtype("timedelta64[ms]"): str_cast.int2timedelta, - np.dtype("timedelta64[us]"): str_cast.int2timedelta, - np.dtype("timedelta64[ns]"): str_cast.int2timedelta, + cudf.dtype("timedelta64[s]"): str_cast.int2timedelta, + cudf.dtype("timedelta64[ms]"): str_cast.int2timedelta, + cudf.dtype("timedelta64[us]"): str_cast.int2timedelta, + cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta, } class StringMethods(ColumnMethods): + """ + Vectorized string functions for Series and Index. + + This mimics pandas ``df.str`` interface. nulls stay null + unless handled otherwise by a particular method. + Patterned after Python’s string methods, with some + inspiration from R’s stringr package. + """ + _column: StringColumn def __init__(self, parent): - """ - Vectorized string functions for Series and Index. - - This mimics pandas ``df.str`` interface. nulls stay null - unless handled otherwise by a particular method. - Patterned after Python’s string methods, with some - inspiration from R’s stringr package. 
- """ value_type = ( parent.dtype.leaf_type if is_list_dtype(parent.dtype) @@ -607,11 +608,12 @@ def extract( if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") - out = libstrings.extract(self._column, pat) - if out._num_columns == 1 and expand is False: - return self._return_or_inplace(out._columns[0], expand=expand) + data, index = libstrings.extract(self._column, pat) + if len(data) == 1 and expand is False: + data = next(iter(data.values())) else: - return self._return_or_inplace(out, expand=expand) + data = cudf.core.frame.Frame(data, index) + return self._return_or_inplace(data, expand=expand) def contains( self, @@ -749,6 +751,59 @@ def contains( ) return self._return_or_inplace(result_col) + def repeat(self, repeats: Union[int, Sequence],) -> SeriesOrIndex: + """ + Duplicate each string in the Series or Index. + Equivalent to `str.repeat() + `_. + + Parameters + ---------- + repeats : int or sequence of int + Same value for all (int) or different value per (sequence). + + Returns + ------- + Series or Index of object + Series or Index of repeated string objects specified by + input parameter repeats. + + Examples + -------- + >>> s = cudf.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + + Single int repeats string in Series + + >>> s.str.repeat(repeats=2) + 0 aa + 1 bb + 2 cc + dtype: object + + Sequence of int repeats corresponding string in Series + + >>> s.str.repeat(repeats=[1, 2, 3]) + 0 a + 1 bb + 2 ccc + dtype: object + """ + if can_convert_to_column(repeats): + return self._return_or_inplace( + libstrings.repeat_sequence( + self._column, column.as_column(repeats, dtype="int"), + ), + ) + + return self._return_or_inplace( + libstrings.repeat_scalar(self._column, repeats) + ) + def replace( self, pat: Union[str, Sequence], @@ -2274,12 +2329,13 @@ def split( if self._column.null_count == len(self._column): result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: - result_table = libstrings.split( + data, index = libstrings.split( self._column, cudf.Scalar(pat, "str"), n ) - if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._column): - result_table = cudf.core.frame.Frame({}) + if len(data) == 1 and data[0].null_count == len(self._column): + result_table = cudf.core.frame.Frame({}) + else: + result_table = cudf.core.frame.Frame(data, index) else: result_table = libstrings.split_record( self._column, cudf.Scalar(pat, "str"), n @@ -2429,12 +2485,13 @@ def rsplit( if self._column.null_count == len(self._column): result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: - result_table = libstrings.rsplit( - self._column, cudf.Scalar(pat), n + data, index = libstrings.rsplit( + self._column, cudf.Scalar(pat, "str"), n ) - if len(result_table._data) == 1: - if result_table._data[0].null_count == len(self._column): - result_table = cudf.core.frame.Frame({}) + if len(data) == 1 and data[0].null_count == len(self._column): + result_table = cudf.core.frame.Frame({}) + else: + result_table = cudf.core.frame.Frame(data, index) else: result_table = libstrings.rsplit_record( self._column, cudf.Scalar(pat), n @@ -2499,7 +2556,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: Also available on indices: - >>> idx = cudf.core.index.StringIndex(['X 123', 'Y 999']) + >>> idx = cudf.Index(['X 123', 'Y 999']) >>> idx StringIndex(['X 123' 'Y 999'], dtype='object') @@ -2519,7 +2576,9 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = 
" " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep)), + cudf.core.frame.Frame( + *libstrings.partition(self._column, cudf.Scalar(sep)) + ), expand=expand, ) @@ -2564,7 +2623,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: Also available on indices: - >>> idx = cudf.core.index.StringIndex(['X 123', 'Y 999']) + >>> idx = cudf.Index(['X 123', 'Y 999']) >>> idx StringIndex(['X 123' 'Y 999'], dtype='object') @@ -2584,7 +2643,9 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep)), + cudf.core.frame.Frame( + *libstrings.rpartition(self._column, cudf.Scalar(sep)) + ), expand=expand, ) @@ -3234,7 +3295,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: This is also available on Index. - >>> index = cudf.core.index.StringIndex(['A', 'A', 'Aaba', 'cat']) + >>> index = cudf.Index(['A', 'A', 'Aaba', 'cat']) >>> index.str.count('a') Int64Index([0, 0, 2, 1], dtype='int64') """ # noqa W605 @@ -3309,8 +3370,9 @@ def findall( if flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") + data, index = libstrings.findall(self._column, pat) return self._return_or_inplace( - libstrings.findall(self._column, pat), expand=expand + cudf.core.frame.Frame(data, index), expand=expand ) def isempty(self) -> SeriesOrIndex: @@ -4861,7 +4923,18 @@ def _expected_types_format(types): class StringColumn(column.ColumnBase): - """Implements operations for Columns of String type + """ + Implements operations for Columns of String type + + Parameters + ---------- + mask : Buffer + The validity mask + offset : int + Data offset + children : Tuple[Column] + Two non-null columns containing the string data and offsets + respectively """ _start_offset: Optional[int] @@ -4876,18 +4949,7 @@ def __init__( null_count: int = None, children: Tuple["column.ColumnBase", ...] 
= (), ): - """ - Parameters - ---------- - mask : Buffer - The validity mask - offset : int - Data offset - children : Tuple[Column] - Two non-null columns containing the string data and offsets - respectively - """ - dtype = np.dtype("object") + dtype = cudf.dtype("object") if size is None: for child in children: @@ -5054,7 +5116,7 @@ def __contains__(self, item: ScalarLike) -> bool: def as_numerical_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.NumericalColumn": - out_dtype = np.dtype(dtype) + out_dtype = cudf.dtype(dtype) if out_dtype.kind in {"i", "u"}: if not libstrings.is_integer(self).all(): @@ -5096,7 +5158,7 @@ def _as_datetime_or_timedelta_column(self, dtype, format): def as_datetime_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.DatetimeColumn": - out_dtype = np.dtype(dtype) + out_dtype = cudf.dtype(dtype) # infer on host from the first not na element # or return all null column if all values @@ -5120,7 +5182,7 @@ def as_datetime_column( def as_timedelta_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.TimeDeltaColumn": - out_dtype = np.dtype(dtype) + out_dtype = cudf.dtype(dtype) format = "%D days %H:%M:%S" return self._as_datetime_or_timedelta_column(out_dtype, format) @@ -5232,7 +5294,7 @@ def deserialize(cls, header: dict, frames: list) -> StringColumn: return col def can_cast_safely(self, to_dtype: Dtype) -> bool: - to_dtype = np.dtype(to_dtype) + to_dtype = cudf.dtype(to_dtype) if self.dtype == to_dtype: return True @@ -5379,7 +5441,7 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": raise ValueError( "Can not produce a view of a string column with nulls" ) - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) str_byte_offset = self.base_children[0].element_indexing(self.offset) str_end_byte_offset = self.base_children[0].element_indexing( self.offset + self.size diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 6988128606e..fd63b4de144 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -191,7 +191,15 @@ def field(self, key): pos = fields.index(key) return self._return_or_inplace(self._column.children[pos]) else: - return self._return_or_inplace(self._column.children[key]) + if isinstance(key, int): + try: + return self._return_or_inplace(self._column.children[key]) + except IndexError: + raise IndexError(f"Index {key} out of range") + else: + raise KeyError( + f"Field '{key}' is not found in the set of existing keys." + ) def explode(self): """ diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index a27c20cc50c..7c1250231f3 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -34,6 +34,24 @@ class TimeDeltaColumn(column.ColumnBase): + """ + Parameters + ---------- + data : Buffer + The Timedelta values + dtype : np.dtype + The data type + size : int + Size of memory allocation. + mask : Buffer; optional + The validity mask + offset : int + Data offset + null_count : int, optional + The number of null values. + If None, it is calculated automatically. + """ + def __init__( self, data: Buffer, @@ -43,24 +61,8 @@ def __init__( offset: int = 0, null_count: int = None, ): - """ - Parameters - ---------- - data : Buffer - The Timedelta values - dtype : np.dtype - The data type - size : int - Size of memory allocation. 
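The stricter key handling added to the struct `field` accessor above distinguishes positional and named lookups; a short example, assuming cuDF is installed:

import cudf

s = cudf.Series([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])
print(s.struct.field("a"))   # child column selected by name
print(s.struct.field(0))     # same column selected by position
# s.struct.field("c")  raises KeyError
# s.struct.field(9)    raises IndexError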
- mask : Buffer; optional - The validity mask - offset : int - Data offset - null_count : int, optional - The number of null values. - If None, it is calculated automatically. - """ - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) + if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: @@ -90,6 +92,15 @@ def __contains__(self, item: DatetimeLikeScalar) -> bool: return False return item.view("int64") in self.as_numerical + @property + def values(self): + """ + Return a CuPy representation of the TimeDeltaColumn. + """ + raise NotImplementedError( + "TimeDelta Arrays is not yet implemented in cudf" + ) + def to_arrow(self) -> pa.Array: mask = None if self.nullable: @@ -137,7 +148,7 @@ def _binary_op_floordiv( rhs = cudf.Scalar(None, "float64") else: rhs = rhs.astype(common_dtype).astype("float64") - out_dtype = np.dtype("int64") + out_dtype = cudf.dtype("int64") elif rhs.dtype.kind in ("f", "i", "u"): out_dtype = self.dtype else: @@ -204,7 +215,7 @@ def _binary_op_truediv( else: rhs = rhs.astype(common_dtype).astype("float64") - out_dtype = np.dtype("float64") + out_dtype = cudf.dtype("float64") elif rhs.dtype.kind in ("f", "i", "u"): out_dtype = self.dtype else: @@ -344,7 +355,7 @@ def as_string_column( ) if len(self) > 0: return string._timedelta_to_str_typecast_functions[ - np.dtype(self.dtype) + cudf.dtype(self.dtype) ](self, format=format) else: return cast( @@ -353,7 +364,7 @@ def as_string_column( ) def as_timedelta_column(self, dtype: Dtype, **kwargs) -> TimeDeltaColumn: - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype == self.dtype: return self return libcudf.unary.cast(self, dtype=dtype) @@ -575,9 +586,9 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype: - if np.can_cast(np.dtype(lhs_dtype), np.dtype(rhs_dtype)): + if np.can_cast(cudf.dtype(lhs_dtype), cudf.dtype(rhs_dtype)): return rhs_dtype - elif np.can_cast(np.dtype(rhs_dtype), np.dtype(lhs_dtype)): + elif np.can_cast(cudf.dtype(rhs_dtype), cudf.dtype(lhs_dtype)): return lhs_dtype else: raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}") @@ -594,7 +605,7 @@ def _timedelta_add_result_dtype( lhs_unit = units.index(lhs_time_unit) rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) - out_dtype = np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") + out_dtype = cudf.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") else: raise TypeError( f"Addition of {lhs.dtype} with {rhs.dtype} " @@ -619,7 +630,7 @@ def _timedelta_sub_result_dtype( lhs_unit = units.index(lhs_time_unit) rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) - out_dtype = np.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") + out_dtype = cudf.dtype(f"datetime64[{units[max(lhs_unit, rhs_unit)]}]") else: raise TypeError( f"Subtraction of {lhs.dtype} with {rhs.dtype} " diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 607b8ac307b..56882f89af8 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -80,6 +80,19 @@ def _to_flat_dict(d): class ColumnAccessor(MutableMapping): + """ + Parameters + ---------- + data : mapping + Mapping of keys to column values. + multiindex : bool, optional + Whether tuple keys represent a hierarchical + index with multiple "levels" (default=False). 
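The datetime/timedelta result dtypes above pick the finer of the two time units; a sketch of that selection (the `units` ordering here is an assumption):

import numpy as np

units = ["s", "ms", "us", "ns"]   # assumed coarse-to-fine ordering

def datetime_add_result_dtype(lhs_unit, rhs_unit):
    finer = units[max(units.index(lhs_unit), units.index(rhs_unit))]
    return np.dtype(f"datetime64[{finer}]")

print(datetime_add_result_dtype("ms", "ns"))   # datetime64[ns]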
+ level_names : tuple, optional + Tuple containing names for each of the levels. + For a non-hierarchical index, a tuple of size 1 + may be passe. + """ _data: "Dict[Any, ColumnBase]" multiindex: bool @@ -91,19 +104,6 @@ def __init__( multiindex: bool = False, level_names=None, ): - """ - Parameters - ---------- - data : mapping - Mapping of keys to column values. - multiindex : bool, optional - Whether tuple keys represent a hierarchical - index with multiple "levels" (default=False). - level_names : tuple, optional - Tuple containing names for each of the levels. - For a non-hierarchical index, a tuple of size 1 - may be passe. - """ if data is None: data = {} # TODO: we should validate the keys of `data` diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 7811f477170..91f623a3cd3 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -5,7 +5,6 @@ import pandas as pd import cudf -from cudf._lib.labeling import label_bins from cudf.core.column import as_column, build_categorical_column from cudf.core.index import IntervalIndex, interval_range from cudf.utils.dtypes import is_list_like @@ -240,7 +239,7 @@ def cut( # the input arr must be changed to the same type as the edges input_arr = input_arr.astype(left_edges.dtype) # get the indexes for the appropriate number - index_labels = label_bins( + index_labels = cudf._lib.labeling.label_bins( input_arr, left_edges, left_inclusive, right_edges, right_inclusive ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7cd42d749ec..721ebf22de7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -10,7 +10,7 @@ import warnings from collections import defaultdict from collections.abc import Iterable, Sequence -from typing import Any, Optional, TypeVar +from typing import Any, MutableMapping, Optional, TypeVar import cupy import numpy as np @@ -23,6 +23,7 @@ from pandas.io.formats.printing import pprint_thing import cudf +import cudf.core.common from cudf import _lib as libcudf from cudf.api.types import is_bool_dtype, is_dict_like from cudf.core import column, reshape @@ -31,7 +32,7 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame, _drop_rows_by_labels from cudf.core.groupby.groupby import DataFrameGroupBy -from cudf.core.index import BaseIndex, Index, RangeIndex, as_index +from cudf.core.index import BaseIndex, RangeIndex, as_index from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer from cudf.core.series import Series from cudf.core.window import Rolling @@ -62,6 +63,7 @@ "max": "nanmax", "sum": "nansum", "prod": "nanprod", + "product": "nanprod", "mean": "nanmean", "std": "nanstd", "var": "nanvar", @@ -69,100 +71,101 @@ class DataFrame(Frame, Serializable, GetAttrGetItemMixin): + """ + A GPU Dataframe object. - _PROTECTED_KEYS = frozenset(("_data", "_index")) - - @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") - def __init__(self, data=None, index=None, columns=None, dtype=None): - """ - A GPU Dataframe object. - - Parameters - ---------- - data : array-like, Iterable, dict, or DataFrame. - Dict can contain Series, arrays, constants, or list-like objects. + Parameters + ---------- + data : array-like, Iterable, dict, or DataFrame. + Dict can contain Series, arrays, constants, or list-like objects. - index : Index or array-like - Index to use for resulting frame. 
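A minimal sketch of the `ColumnAccessor` parameters documented above, using the internal constructor purely for illustration:

from cudf.core.column import as_column
from cudf.core.column_accessor import ColumnAccessor

ca = ColumnAccessor(
    {("a", "x"): as_column([1, 2]), ("a", "y"): as_column([3, 4])},
    multiindex=True,
    level_names=("outer", "inner"),
)
print(list(ca.keys()))   # [('a', 'x'), ('a', 'y')]
print(ca.nrows)          # 2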
Will default to - RangeIndex if no indexing information part of input data and - no index provided. + index : Index or array-like + Index to use for resulting frame. Will default to + RangeIndex if no indexing information part of input data and + no index provided. - columns : Index or array-like - Column labels to use for resulting frame. - Will default to RangeIndex (0, 1, 2, …, n) if no column - labels are provided. + columns : Index or array-like + Column labels to use for resulting frame. + Will default to RangeIndex (0, 1, 2, …, n) if no column + labels are provided. - dtype : dtype, default None - Data type to force. Only a single dtype is allowed. - If None, infer. + dtype : dtype, default None + Data type to force. Only a single dtype is allowed. + If None, infer. - Examples - -------- + Examples + -------- - Build dataframe with ``__setitem__``: + Build dataframe with ``__setitem__``: - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df - key val - 0 0 10.0 - 1 1 11.0 - 2 2 12.0 - 3 3 13.0 - 4 4 14.0 + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2, 3, 4] + >>> df['val'] = [float(i + 10) for i in range(5)] # insert column + >>> df + key val + 0 0 10.0 + 1 1 11.0 + 2 2 12.0 + 3 3 13.0 + 4 4 14.0 + + Build DataFrame via dict of columns: + + >>> import numpy as np + >>> from datetime import datetime, timedelta + >>> t0 = datetime.strptime('2018-10-07 12:00:00', '%Y-%m-%d %H:%M:%S') + >>> n = 5 + >>> df = cudf.DataFrame({ + ... 'id': np.arange(n), + ... 'datetimes': np.array( + ... [(t0+ timedelta(seconds=x)) for x in range(n)]) + ... }) + >>> df + id datetimes + 0 0 2018-10-07T12:00:00.000 + 1 1 2018-10-07T12:00:01.000 + 2 2 2018-10-07T12:00:02.000 + 3 3 2018-10-07T12:00:03.000 + 4 4 2018-10-07T12:00:04.000 + + Build DataFrame via list of rows as tuples: + + >>> df = cudf.DataFrame([ + ... (5, "cats", "jump", np.nan), + ... (2, "dogs", "dig", 7.5), + ... (3, "cows", "moo", -2.1, "occasionally"), + ... ]) + >>> df + 0 1 2 3 4 + 0 5 cats jump + 1 2 dogs dig 7.5 + 2 3 cows moo -2.1 occasionally + + Convert from a Pandas DataFrame: - Build DataFrame via dict of columns: + >>> import pandas as pd + >>> pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]}) + >>> pdf + a b + 0 0 0.1 + 1 1 0.2 + 2 2 NaN + 3 3 0.3 + >>> df = cudf.from_pandas(pdf) + >>> df + a b + 0 0 0.1 + 1 1 0.2 + 2 2 + 3 3 0.3 + """ - >>> import numpy as np - >>> from datetime import datetime, timedelta - >>> t0 = datetime.strptime('2018-10-07 12:00:00', '%Y-%m-%d %H:%M:%S') - >>> n = 5 - >>> df = cudf.DataFrame({ - ... 'id': np.arange(n), - ... 'datetimes': np.array( - ... [(t0+ timedelta(seconds=x)) for x in range(n)]) - ... }) - >>> df - id datetimes - 0 0 2018-10-07T12:00:00.000 - 1 1 2018-10-07T12:00:01.000 - 2 2 2018-10-07T12:00:02.000 - 3 3 2018-10-07T12:00:03.000 - 4 4 2018-10-07T12:00:04.000 - - Build DataFrame via list of rows as tuples: - - >>> df = cudf.DataFrame([ - ... (5, "cats", "jump", np.nan), - ... (2, "dogs", "dig", 7.5), - ... (3, "cows", "moo", -2.1, "occasionally"), - ... 
]) - >>> df - 0 1 2 3 4 - 0 5 cats jump - 1 2 dogs dig 7.5 - 2 3 cows moo -2.1 occasionally + _PROTECTED_KEYS = frozenset(("_data", "_index")) - Convert from a Pandas DataFrame: + @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") + def __init__(self, data=None, index=None, columns=None, dtype=None): - >>> import pandas as pd - >>> pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]}) - >>> pdf - a b - 0 0 0.1 - 1 1 0.2 - 2 2 NaN - 3 3 0.3 - >>> df = cudf.from_pandas(pdf) - >>> df - a b - 0 0 0.1 - 1 1 0.2 - 2 2 - 3 3 0.3 - """ super().__init__() if isinstance(columns, (Series, cudf.BaseIndex)): @@ -455,30 +458,16 @@ def _init_from_dict_like(self, data, index=None, columns=None): if columns is not None: self.columns = columns - @classmethod - def _from_table(cls, table, index=None): - if index is None: - if table._index is not None: - index = Index._from_table(table._index) - else: - index = RangeIndex(table._num_rows) - out = cls.__new__(cls) - out._data = table._data - out._index = index - return out - @classmethod def _from_data( cls, - data: ColumnAccessor, - index: Optional[Index] = None, + data: MutableMapping, + index: Optional[BaseIndex] = None, columns: Any = None, ) -> DataFrame: - out = cls.__new__(cls) - out._data = data + out = super()._from_data(data, index) if index is None: - index = cudf.Index(range(data.nrows)) - out._index = index + out.index = RangeIndex(out._data.nrows) if columns is not None: out.columns = columns return out @@ -864,17 +853,20 @@ def _slice(self: T, arg: slice) -> T: ) ) else: - result = self._from_table( - libcudf.copying.table_slice( + result = self._from_data( + *libcudf.copying.table_slice( self, [start, stop], keep_index )[0] ) result._copy_type_metadata(self, include_index=keep_index) - # Adding index of type RangeIndex back to - # result - if keep_index is False and self.index is not None: - result.index = self.index[start:stop] + if self.index is not None: + if keep_index: + result._index.names = self.index.names + else: + # Adding index of type RangeIndex back to + # result + result.index = self.index[start:stop] result.columns = self.columns return result @@ -3476,7 +3468,7 @@ def rename( if index: if ( any(type(item) == str for item in index.values()) - and type(self.index) != cudf.core.index.StringIndex + and type(self.index) != cudf.StringIndex ): raise NotImplementedError( "Implicit conversion of index to " @@ -3547,12 +3539,12 @@ def as_gpu_matrix(self, columns=None, order="F"): if ncol < 1: # This is the case for empty dataframe - construct empty cupy array matrix = cupy.empty( - shape=(0, 0), dtype=np.dtype("float64"), order=order + shape=(0, 0), dtype=cudf.dtype("float64"), order=order ) return cuda.as_cuda_array(matrix) if any( - (is_categorical_dtype(c) or np.issubdtype(c, np.dtype("object"))) + (is_categorical_dtype(c) or np.issubdtype(c, cudf.dtype("object"))) for c in cols ): raise TypeError("non-numeric data not yet supported") @@ -3566,7 +3558,7 @@ def as_gpu_matrix(self, columns=None, order="F"): ) cupy_dtype = dtype if np.issubdtype(cupy_dtype, np.datetime64): - cupy_dtype = np.dtype("int64") + cupy_dtype = cudf.dtype("int64") if order not in ("F", "C"): raise ValueError( @@ -3893,9 +3885,9 @@ def sort_values( Examples -------- >>> import cudf - >>> a = ('a', [0, 1, 2]) - >>> b = ('b', [-3, 2, 0]) - >>> df = cudf.DataFrame([a, b]) + >>> df = cudf.DataFrame() + >>> df['a'] = [0, 1, 2] + >>> df['b'] = [-3, 2, 0] >>> df.sort_values('b') a b 0 0 -3 @@ -3904,8 +3896,17 @@ def sort_values( """ if inplace: 
raise NotImplementedError("`inplace` not currently implemented.") - if kind != "quicksort": - raise NotImplementedError("`kind` not currently implemented.") + if kind not in {"quicksort", "mergesort", "heapsort", "stable"}: + raise AttributeError( + f"{kind} is not a valid sorting algorithm for " + f"'DataFrame' object" + ) + elif kind != "quicksort": + msg = ( + f"GPU-accelerated {kind} is currently not supported, " + f"now defaulting to GPU-accelerated quicksort." + ) + warnings.warn(msg) if axis != 0: raise NotImplementedError("`axis` not currently implemented.") @@ -4214,10 +4215,12 @@ def transpose(self): index = self.columns.copy(deep=False) if self._num_columns == 0 or self._num_rows == 0: return DataFrame(index=index, columns=columns) - # Cython renames the columns to the range [0...ncols] - result = self.__class__._from_table(libcudf.transpose.transpose(self)) # Set the old column names as the new index - result._index = as_index(index) + result = self.__class__._from_data( + # Cython renames the columns to the range [0...ncols] + libcudf.transpose.transpose(self), + as_index(index), + ) # Set the old index as the new column names result.columns = columns return result @@ -4458,6 +4461,7 @@ def join( ) return df + @copy_docstring(DataFrameGroupBy) def groupby( self, by=None, @@ -4502,6 +4506,7 @@ def groupby( sort=sort, ) + @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None ): @@ -5851,7 +5856,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): ) if data.ndim == 2: - num_cols = len(data[0]) + num_cols = data.shape[1] else: # Since we validate ndim to be either 1 or 2 above, # this case can be assumed to be ndim == 1. @@ -5896,6 +5901,36 @@ def _from_columns(cls, cols, index=None, columns=None): return cls(data=data, index=index,) + def interpolate( + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction=None, + limit_area=None, + downcast=None, + **kwargs, + ): + + if all(dt == np.dtype("object") for dt in self.dtypes): + raise TypeError( + "Cannot interpolate with all object-dtype " + "columns in the DataFrame. Try setting at " + "least one column to a numeric dtype." + ) + + return super().interpolate( + method=method, + axis=axis, + limit=limit, + inplace=inplace, + limit_direction=limit_direction, + limit_area=limit_area, + downcast=downcast, + **kwargs, + ) + def quantile( self, q=0.5, @@ -6140,12 +6175,12 @@ def isin(self, values): isinstance( self[col]._column, cudf.core.column.CategoricalColumn ) - or np.issubdtype(self[col].dtype, np.dtype("object")) + or np.issubdtype(self[col].dtype, cudf.dtype("object")) ) or ( isinstance( values._column, cudf.core.column.CategoricalColumn ) - or np.issubdtype(values.dtype, np.dtype("object")) + or np.issubdtype(values.dtype, cudf.dtype("object")) ): result[col] = utils.scalar_broadcast_to(False, len(self)) else: @@ -6195,7 +6230,7 @@ def _prepare_for_rowwise_op(self, method, skipna): col.nullable for col in self._columns ): msg = ( - f"Row-wise operations to calculate '{method}' is not " + f"Row-wise operations to calculate '{method}' do not " f"currently support columns with null values. " f"Consider removing them with .dropna() " f"or using .fillna()." 
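The reworked `sort_values` validation above now accepts any of the pandas sorting algorithm names but, because only GPU-accelerated quicksort is currently available, it warns and falls back to quicksort for the other recognised kinds, while unrecognised names are rejected. A rough sketch of the resulting behaviour (illustrative only; the frame and its values are made up and this snippet is not part of the patch):

    import warnings

    import cudf

    df = cudf.DataFrame({"a": [0, 1, 2], "b": [-3, 2, 0]})

    # A recognised but unsupported algorithm warns, then quicksort is used.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        df.sort_values("b", kind="mergesort")
    assert any("quicksort" in str(w.message) for w in caught)

    # An unrecognised algorithm name is rejected outright.
    try:
        df.sort_values("b", kind="not-a-sort")
    except AttributeError:
        pass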
@@ -6266,472 +6301,56 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Single 5 dtype: int64 """ - if axis not in (0, "index", None): + axis = self._get_axis_from_axis_arg(axis) + if axis != 0: raise NotImplementedError("Only axis=0 is currently supported.") - return self._apply_support_method( - "count", - axis=axis, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def min( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, - ): - """ - Return the minimum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.min() - a 1 - b 7 - dtype: int64 - """ - return self._apply_support_method( - "min", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def max( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, - ): - """ - Return the maximum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.max() - a 4 - b 10 - dtype: int64 - """ - return self._apply_support_method( - "max", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def sum( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return sum of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.sum() - a 10 - b 34 - dtype: int64 - """ - return self._apply_support_method( - "sum", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, + return Series._from_data( + {None: [self._data[col].valid_count for col in self._data.names]}, + as_index(self._data.names), ) - def product( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.product() - a 24 - b 5040 - dtype: int64 - """ - return self._apply_support_method( - "prod", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) + _SUPPORT_AXIS_LOOKUP = { + 0: 0, + 1: 1, + None: 0, + "index": 0, + "columns": 1, + } - def prod( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, + def _reduce( + self, op, axis=None, level=None, numeric_only=None, **kwargs, ): - """ - Return product of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.prod() - a 24 - b 5040 - dtype: int64 - """ - return self.product( - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - def cummin(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative minimum of the DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. 
- - Returns - ------- - DataFrame - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.cummin() - a b - 0 1 7 - 1 1 7 - 2 1 7 - 3 1 7 - """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - - return self._apply_support_method( - "cummin", axis=axis, skipna=skipna, *args, **kwargs - ) - - def cummax(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative maximum of the DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - DataFrame - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.cummax() - a b - 0 1 7 - 1 2 8 - 2 3 9 - 3 4 10 - """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - - return self._apply_support_method( - "cummax", axis=axis, skipna=skipna, *args, **kwargs - ) - - def cumsum(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative sum of the DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - - Returns - ------- - DataFrame - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> s.cumsum() - a b - 0 1 7 - 1 3 15 - 2 6 24 - 3 10 34 - """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - - return self._apply_support_method( - "cumsum", axis=axis, skipna=skipna, *args, **kwargs - ) - - def cumprod(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative product of the DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - DataFrame + if level is not None: + raise NotImplementedError("level parameter is not implemented yet") - Notes - ----- - Parameters currently not supported is `axis` + if numeric_only not in (None, True): + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + axis = self._get_axis_from_axis_arg(axis) - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> s.cumprod() - a b - 0 1 7 - 1 2 56 - 2 6 504 - 3 24 5040 - """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") + if axis == 0: + result = [ + getattr(self._data[col], op)(**kwargs) + for col in self._data.names + ] - return self._apply_support_method( - "cumprod", axis=axis, skipna=skipna, *args, **kwargs - ) + return Series._from_data( + {None: result}, as_index(self._data.names) + ) + elif axis == 1: + return self._apply_cupy_method_axis_1(op, **kwargs) - def mean( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + def _scan( + self, op, axis=None, *args, **kwargs, ): - """ - Return the mean of the values for the requested axis. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'} - Axis for the function to be applied on. 
- skipna : bool, default True - Exclude NA/null values when computing the result. - level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only : bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. Not implemented for - Series. - **kwargs - Additional keyword arguments to be passed to the function. - - Returns - ------- - mean : Series or DataFrame (if level specified) + axis = self._get_axis_from_axis_arg(axis) - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.mean() - a 2.5 - b 8.5 - dtype: float64 - """ - return self._apply_support_method( - "mean", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) + if axis == 0: + return super()._scan(op, axis=axis, *args, **kwargs) + elif axis == 1: + return self._apply_cupy_method_axis_1(f"cum{op}", **kwargs) def mode(self, axis=0, numeric_only=False, dropna=True): """ @@ -6759,9 +6378,9 @@ def mode(self, axis=0, numeric_only=False, dropna=True): See Also -------- - cudf.core.series.Series.mode : Return the highest frequency value + cudf.Series.mode : Return the highest frequency value in a Series. - cudf.core.series.Series.value_counts : Return the counts of values + cudf.Series.value_counts : Return the counts of values in a Series. Notes @@ -6833,424 +6452,129 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df - def std( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return sample standard deviation of the DataFrame. - - Normalized by N-1 by default. This can be changed using - the `ddof` argument - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is N - ddof, where N represents the number of elements. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.std() - a 1.290994 - b 1.290994 - dtype: float64 - """ - - return self._apply_support_method( - "std", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - - def var( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return unbiased variance of the DataFrame. - - Normalized by N-1 by default. This can be changed using the - ddof argument - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is - N - ddof, where N represents the number of elements. 
- - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.var() - a 1.666667 - b 1.666667 - dtype: float64 - """ - return self._apply_support_method( - "var", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): - """ - Return Fisher's unbiased kurtosis of a sample. - - Kurtosis obtained using Fisher’s definition of - kurtosis (kurtosis of normal == 0.0). Normalized by N-1. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.kurt() - a -1.2 - b -1.2 - dtype: float64 - """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - - if numeric_only not in (None, True): - msg = "Kurtosis only supports int, float, and bool dtypes." - raise NotImplementedError(msg) - - filtered = self.select_dtypes(include=[np.number, np.bool_]) - return filtered._apply_support_method( - "kurtosis", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, + obj = self.select_dtypes(include=[np.number, np.bool_]) + return super(DataFrame, obj).kurtosis( + axis, skipna, level, numeric_only, **kwargs ) - # Alias for kurtosis. - kurt = kurtosis - def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): - """ - Return unbiased Fisher-Pearson skew of a sample. - - Parameters - ---------- - skipna: bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]}) - >>> df.skew() - a 0.00000 - b -0.37037 - dtype: float64 - """ - if axis not in (0, "index", None): - raise NotImplementedError("Only axis=0 is currently supported.") - - if numeric_only not in (None, True): - msg = "Skew only supports int, float, and bool dtypes." - raise NotImplementedError(msg) - - filtered = self.select_dtypes(include=[np.number, np.bool_]) - return filtered._apply_support_method( - "skew", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, + obj = self.select_dtypes(include=[np.number, np.bool_]) + return super(DataFrame, obj).skew( + axis, skipna, level, numeric_only, **kwargs ) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether all elements are True in DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be True, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) - >>> df.all() - a True - b False - dtype: bool - """ - if bool_only: - return self.select_dtypes(include="bool")._apply_support_method( - "all", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) - return self._apply_support_method( - "all", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) + obj = self.select_dtypes(include="bool") if bool_only else self + return super(DataFrame, obj).all(axis, skipna, level, **kwargs) def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether any elements is True in DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be False, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) - >>> df.any() - a True - b True - dtype: bool - """ - if bool_only: - return self.select_dtypes(include="bool")._apply_support_method( - "any", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, + obj = self.select_dtypes(include="bool") if bool_only else self + return super(DataFrame, obj).any(axis, skipna, level, **kwargs) + + def _apply_cupy_method_axis_1(self, method, *args, **kwargs): + # This method uses cupy to perform scans and reductions along rows of a + # DataFrame. Since cuDF is designed around columnar storage and + # operations, we convert DataFrames to 2D cupy arrays for these ops. + + # for dask metadata compatibility + skipna = kwargs.pop("skipna", None) + skipna = True if skipna is None else skipna + if method not in _cupy_nan_methods_map and skipna not in ( + None, + True, + 1, + ): + raise NotImplementedError( + f"Row-wise operations to calculate '{method}'" + f" currently do not support `skipna=False`." ) - return self._apply_support_method( - "any", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) - - def _apply_support_method(self, method, axis=0, *args, **kwargs): - assert axis in (None, 0, 1) - - if axis in (None, 0): - result = [ - getattr(self[col], method)(*args, **kwargs) - for col in self._data.names - ] - - if isinstance(result[0], Series): - support_result = result - result = DataFrame(index=support_result[0].index) - for idx, col in enumerate(self._data.names): - result[col] = support_result[idx] - else: - result = Series(result) - result = result.set_index(self._data.names) - return result - elif axis == 1: - # for dask metadata compatibility - skipna = kwargs.pop("skipna", None) - if method not in _cupy_nan_methods_map and skipna not in ( - None, - True, - 1, - ): - raise NotImplementedError( - f"Row-wise operation to calculate '{method}'" - f" currently do not support `skipna=False`." - ) + level = kwargs.pop("level", None) + if level not in (None,): + raise NotImplementedError( + "Row-wise operations currently do not support `level`." + ) - level = kwargs.pop("level", None) - if level not in (None,): - raise NotImplementedError( - "Row-wise operations currently do not support `level`." 
- ) + numeric_only = kwargs.pop("numeric_only", None) + if numeric_only not in (None, True): + raise NotImplementedError( + "Row-wise operations currently do not " + "support `numeric_only=False`." + ) - numeric_only = kwargs.pop("numeric_only", None) - if numeric_only not in (None, True): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `numeric_only=False`." - ) + min_count = kwargs.pop("min_count", None) + if min_count not in (None, 0): + raise NotImplementedError( + "Row-wise operations currently do not support `min_count`." + ) - min_count = kwargs.pop("min_count", None) - if min_count not in (None, 0): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `min_count`." - ) + bool_only = kwargs.pop("bool_only", None) + if bool_only not in (None, True): + raise NotImplementedError( + "Row-wise operations currently do not support `bool_only`." + ) - bool_only = kwargs.pop("bool_only", None) - if bool_only not in (None, True): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `bool_only`." - ) + # This parameter is only necessary for axis 0 reductions that cuDF + # performs internally. cupy already upcasts smaller integer/bool types + # to int64 when accumulating. + kwargs.pop("cast_to_int", None) - prepared, mask, common_dtype = self._prepare_for_rowwise_op( - method, skipna - ) - for col in prepared._data.names: - if prepared._data[col].nullable: - prepared._data[col] = ( - prepared._data[col] - .astype( - cudf.utils.dtypes.get_min_float_dtype( - prepared._data[col] - ) - if not is_datetime_dtype(common_dtype) - else np.dtype("float64") + prepared, mask, common_dtype = self._prepare_for_rowwise_op( + method, skipna + ) + for col in prepared._data.names: + if prepared._data[col].nullable: + prepared._data[col] = ( + prepared._data[col] + .astype( + cudf.utils.dtypes.get_min_float_dtype( + prepared._data[col] ) - .fillna(np.nan) + if not is_datetime_dtype(common_dtype) + else cudf.dtype("float64") ) - arr = cupy.asarray(prepared.as_gpu_matrix()) - - if skipna is not False and method in _cupy_nan_methods_map: - method = _cupy_nan_methods_map[method] - - result = getattr(cupy, method)(arr, axis=1, **kwargs) - - if result.ndim == 1: - type_coerced_methods = { - "count", - "min", - "max", - "sum", - "prod", - "cummin", - "cummax", - "cumsum", - "cumprod", - } - result_dtype = ( - common_dtype - if method in type_coerced_methods - or is_datetime_dtype(common_dtype) - else None + .fillna(np.nan) ) - result = column.as_column(result, dtype=result_dtype) - if mask is not None: - result = result.set_mask( - cudf._lib.transform.bools_to_mask(mask._column) - ) - return Series(result, index=self.index, dtype=result_dtype,) - else: - result_df = DataFrame(result).set_index(self.index) - result_df.columns = prepared.columns - return result_df + arr = cupy.asarray(prepared.as_gpu_matrix()) + + if skipna is not False and method in _cupy_nan_methods_map: + method = _cupy_nan_methods_map[method] + + result = getattr(cupy, method)(arr, axis=1, **kwargs) + + if result.ndim == 1: + type_coerced_methods = { + "count", + "min", + "max", + "sum", + "prod", + "cummin", + "cummax", + "cumsum", + "cumprod", + } + result_dtype = ( + common_dtype + if method in type_coerced_methods + or is_datetime_dtype(common_dtype) + else None + ) + result = column.as_column(result, dtype=result_dtype) + if mask is not None: + result = result.set_mask( + cudf._lib.transform.bools_to_mask(mask._column) + ) + return Series(result, 
index=self.index, dtype=result_dtype,) + else: + result_df = DataFrame(result).set_index(self.index) + result_df.columns = prepared.columns + return result_df def _columns_view(self, columns): """ @@ -8084,7 +7408,7 @@ def _get_union_of_indices(indexes): if len(indexes) == 1: return indexes[0] else: - merged_index = cudf.core.Index._concat(indexes) + merged_index = cudf.Index._concat(indexes) merged_index = merged_index.drop_duplicates() _, inds = merged_index._values.sort_by_values() return merged_index.take(inds) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 6dbe55d0bb8..ead0b6453c1 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -21,20 +21,71 @@ from cudf.core.buffer import Buffer +def dtype(arbitrary): + """ + Return the cuDF-supported dtype corresponding to `arbitrary`. + + Parameters + ---------- + arbitrary: dtype or scalar-like + + Returns + ------- + dtype: the cuDF-supported dtype that best matches `arbitrary` + """ + # first, try interpreting arbitrary as a NumPy dtype that we support: + try: + np_dtype = np.dtype(arbitrary) + if np_dtype.name == "float16": + return np.dtype("float32") + elif np_dtype.name == "float128": + raise NotImplementedError() + elif np_dtype.kind in ("OU"): + return np.dtype("object") + except TypeError: + pass + else: + if np_dtype.kind not in "biufUOMm": + raise TypeError(f"Unsupported type {np_dtype}") + return np_dtype + + # next, check if `arbitrary` is one of our extension types: + if isinstance(arbitrary, cudf.core.dtypes._BaseDtype): + return arbitrary + + # use `pandas_dtype` to try and interpret + # `arbitrary` as a Pandas extension type. + # Return the corresponding NumPy/cuDF type. + pd_dtype = pd.api.types.pandas_dtype(arbitrary) + try: + return dtype(pd_dtype.numpy_dtype) + except AttributeError: + if isinstance(pd_dtype, pd.CategoricalDtype): + return cudf.CategoricalDtype.from_pandas(pd_dtype) + elif isinstance(pd_dtype, pd.StringDtype): + return np.dtype("object") + elif isinstance(pd_dtype, pd.IntervalDtype): + return cudf.IntervalDtype.from_pandas(pd_dtype) + else: + raise TypeError( + f"Cannot interpret {arbitrary} as a valid cuDF dtype" + ) + + class _BaseDtype(ExtensionDtype, Serializable): # Base type for all cudf-specific dtypes pass class CategoricalDtype(_BaseDtype): + """ + dtype similar to pd.CategoricalDtype with the categories + stored on the GPU. + """ ordered: Optional[bool] def __init__(self, categories=None, ordered: bool = None) -> None: - """ - dtype similar to pd.CategoricalDtype with the categories - stored on the GPU. 
- """ self._categories = self._init_categories(categories) self.ordered = ordered @@ -157,7 +208,7 @@ def element_type(self) -> Dtype: elif isinstance(self._typ.value_type, pa.StructType): return StructDtype.from_arrow(self._typ.value_type) else: - return np.dtype(self._typ.value_type.to_pandas_dtype()).name + return cudf.dtype(self._typ.value_type.to_pandas_dtype()).name @property def leaf_type(self): @@ -223,14 +274,14 @@ def deserialize(cls, header: dict, frames: list): class StructDtype(_BaseDtype): + """ + fields : dict + A mapping of field names to dtypes + """ name = "struct" def __init__(self, fields): - """ - fields : dict - A mapping of field names to dtypes - """ pa_fields = { k: cudf.utils.dtypes.cudf_dtype_to_pa_type(v) for k, v in fields.items() @@ -309,34 +360,34 @@ def deserialize(cls, header: dict, frames: list): class Decimal32Dtype(_BaseDtype): + """ + Parameters + ---------- + precision : int + The total number of digits in each value of this dtype + scale : int, optional + The scale of the Decimal32Dtype. See Notes below. + + Notes + ----- + When the scale is positive: + - numbers with fractional parts (e.g., 0.0042) can be represented + - the scale is the total number of digits to the right of the + decimal point + When the scale is negative: + - only multiples of powers of 10 (including 10**0) can be + represented (e.g., 1729, 4200, 1000000) + - the scale represents the number of trailing zeros in the value. + For example, 42 is representable with precision=2 and scale=0. + 13.0051 is representable with precision=6 and scale=4, + and *not* representable with precision<6 or scale<4. + """ name = "decimal32" _metadata = ("precision", "scale") MAX_PRECISION = np.floor(np.log10(np.iinfo("int32").max)) def __init__(self, precision, scale=0): - """ - Parameters - ---------- - precision : int - The total number of digits in each value of this dtype - scale : int, optional - The scale of the Decimal32Dtype. See Notes below. - - Notes - ----- - When the scale is positive: - - numbers with fractional parts (e.g., 0.0042) can be represented - - the scale is the total number of digits to the right of the - decimal point - When the scale is negative: - - only multiples of powers of 10 (including 10**0) can be - represented (e.g., 1729, 4200, 1000000) - - the scale represents the number of trailing zeros in the value. - For example, 42 is representable with precision=2 and scale=0. - 13.0051 is representable with precision=6 and scale=4, - and *not* representable with precision<6 or scale<4. - """ self._validate(precision, scale) self._typ = pa.decimal128(precision, scale) @@ -417,34 +468,34 @@ def deserialize(cls, header: dict, frames: list): class Decimal64Dtype(_BaseDtype): + """ + Parameters + ---------- + precision : int + The total number of digits in each value of this dtype + scale : int, optional + The scale of the Decimal64Dtype. See Notes below. + + Notes + ----- + When the scale is positive: + - numbers with fractional parts (e.g., 0.0042) can be represented + - the scale is the total number of digits to the right of the + decimal point + When the scale is negative: + - only multiples of powers of 10 (including 10**0) can be + represented (e.g., 1729, 4200, 1000000) + - the scale represents the number of trailing zeros in the value. + For example, 42 is representable with precision=2 and scale=0. + 13.0051 is representable with precision=6 and scale=4, + and *not* representable with precision<6 or scale<4. 
+ """ name = "decimal64" _metadata = ("precision", "scale") MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max)) def __init__(self, precision, scale=0): - """ - Parameters - ---------- - precision : int - The total number of digits in each value of this dtype - scale : int, optional - The scale of the Decimal64Dtype. See Notes below. - - Notes - ----- - When the scale is positive: - - numbers with fractional parts (e.g., 0.0042) can be represented - - the scale is the total number of digits to the right of the - decimal point - When the scale is negative: - - only multiples of powers of 10 (including 10**0) can be - represented (e.g., 1729, 4200, 1000000) - - the scale represents the number of trailing zeros in the value. - For example, 42 is representable with precision=2 and scale=0. - 13.0051 is representable with precision=6 and scale=4, - and *not* representable with precision<6 or scale<4. - """ self._validate(precision, scale) self._typ = pa.decimal128(precision, scale) @@ -525,16 +576,17 @@ def deserialize(cls, header: dict, frames: list): class IntervalDtype(StructDtype): + """ + subtype: str, np.dtype + The dtype of the Interval bounds. + closed: {‘right’, ‘left’, ‘both’, ‘neither’}, default ‘right’ + Whether the interval is closed on the left-side, right-side, + both or neither. See the Notes for more detailed explanation. + """ + name = "interval" def __init__(self, subtype, closed="right"): - """ - subtype: str, np.dtype - The dtype of the Interval bounds. - closed: {‘right’, ‘left’, ‘both’, ‘neither’}, default ‘right’ - Whether the interval is closed on the left-side, right-side, - both or neither. See the Notes for more detailed explanation. - """ super().__init__(fields={"left": subtype, "right": subtype}) if closed in ["left", "right", "neither", "both"]: @@ -559,6 +611,12 @@ def to_arrow(self): pa.from_numpy_dtype(self.subtype), self.closed ) + @classmethod + def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype": + return cls( + subtype=pd_dtype.subtype + ) # TODO: needs `closed` when we upgrade Pandas + def is_categorical_dtype(obj): """Check whether an array-like or dtype is of the Categorical dtype. 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6ecb0bcc139..9f743cd8c85 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import functools import warnings from collections import abc -from typing import Any, Dict, Optional, Tuple, TypeVar, Union +from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union import cupy import numpy as np @@ -27,6 +27,7 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import merge +from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, @@ -64,8 +65,14 @@ def __init_subclass__(cls): cls._accessors = set() @classmethod - def _from_table(cls, table: Frame): - return cls(table._data, index=table._index) + def _from_data( + cls, + data: MutableMapping, + index: Optional[cudf.core.index.BaseIndex] = None, + ): + obj = cls.__new__(cls) + libcudf.table.Table.__init__(obj, data, index) + return obj def _mimic_inplace( self: T, result: Frame, inplace: bool = False @@ -476,8 +483,8 @@ def _concat( ) # Concatenate the Tables - out = cls._from_table( - libcudf.concat.concat_tables(tables, ignore_index=ignore_index) + out = cls._from_data( + *libcudf.concat.concat_tables(tables, ignore_index) ) # If ignore_index is True, all input frames are empty, and at @@ -612,10 +619,11 @@ def _explode(self, explode_column: Any, ignore_index: bool): if not ignore_index and self._index is not None: explode_column_num += self._index.nlevels - res_tbl = libcudf.lists.explode_outer( - self, explode_column_num, ignore_index + res = self.__class__._from_data( # type: ignore + *libcudf.lists.explode_outer( + self, explode_column_num, ignore_index + ) ) - res = self.__class__._from_table(res_tbl) res._data.multiindex = self._data.multiindex res._data._level_names = self._data._level_names @@ -644,14 +652,15 @@ def _get_columns_by_index(self, indices): def _gather(self, gather_map, keep_index=True, nullify=False): if not is_integer_dtype(gather_map.dtype): gather_map = gather_map.astype("int32") - result = self.__class__._from_table( - libcudf.copying.gather( + result = self.__class__._from_data( + *libcudf.copying.gather( self, as_column(gather_map), keep_index=keep_index, nullify=nullify, ) ) + result._copy_type_metadata(self, include_index=keep_index) if keep_index and self._index is not None: result._index.names = self._index.names @@ -663,10 +672,10 @@ def _hash(self, initial_hash_values=None): def _hash_partition( self, columns_to_hash, num_partitions, keep_index=True ): - output, offsets = libcudf.hash.hash_partition( + output_data, output_index, offsets = libcudf.hash.hash_partition( self, columns_to_hash, num_partitions, keep_index ) - output = self.__class__._from_table(output) + output = self.__class__._from_data(output_data, output_index) output._copy_type_metadata(self, include_index=keep_index) return output, offsets @@ -684,14 +693,16 @@ def _as_column(self): return self._data[None].copy(deep=False) def _scatter(self, key, value): - result = self._from_table(libcudf.copying.scatter(value, key, self)) + result = self.__class__._from_data( + *libcudf.copying.scatter(value, key, self) + ) result._copy_type_metadata(self) return result def _empty_like(self, keep_index=True): - result = self._from_table( - libcudf.copying.table_empty_like(self, keep_index) + result = self.__class__._from_data( + *libcudf.copying.table_empty_like(self, keep_index) ) 
result._copy_type_metadata(self, include_index=keep_index) @@ -876,8 +887,9 @@ def where(self, cond, other=None, inplace=False): 4 dtype: int64 """ + import cudf.core._internals.where - return cudf.core._internals.where( + return cudf.core._internals.where.where( frame=self, cond=cond, other=other, inplace=inplace ) @@ -944,10 +956,10 @@ def mask(self, cond, other=None, inplace=False): def _partition(self, scatter_map, npartitions, keep_index=True): - output_table, output_offsets = libcudf.partitioning.partition( + data, index, output_offsets = libcudf.partitioning.partition( self, scatter_map, npartitions, keep_index ) - partitioned = self.__class__._from_table(output_table) + partitioned = self.__class__._from_data(data, index) # due to the split limitation mentioned # here: https://github.com/rapidsai/cudf/issues/4607 @@ -1108,19 +1120,19 @@ def dropna( See also -------- - cudf.core.dataframe.DataFrame.isna + cudf.DataFrame.isna Indicate null values. - cudf.core.dataframe.DataFrame.notna + cudf.DataFrame.notna Indicate non-null values. - cudf.core.dataframe.DataFrame.fillna + cudf.DataFrame.fillna Replace null values. - cudf.core.series.Series.dropna + cudf.Series.dropna Drop null values. - cudf.core.index.Index.dropna + cudf.Index.dropna Drop null indices. Examples @@ -1332,7 +1344,7 @@ def fillna( ) or method is not None if should_fill: copy_data[name] = copy_data[name].fillna(value[name], method) - result = self._from_table(Frame(copy_data, self._index)) + result = self._from_data(copy_data, self._index) return self._mimic_inplace(result, inplace=inplace) @@ -1381,8 +1393,8 @@ def _drop_na_rows( else: frame._data[name] = col - result = frame.__class__._from_table( - libcudf.stream_compaction.drop_nulls( + result = self.__class__._from_data( + *libcudf.stream_compaction.drop_nulls( frame, how=how, keys=subset, thresh=thresh ) ) @@ -1427,14 +1439,83 @@ def _apply_boolean_mask(self, boolean_mask): """ boolean_mask = as_column(boolean_mask) - result = self.__class__._from_table( - libcudf.stream_compaction.apply_boolean_mask( + result = self.__class__._from_data( + *libcudf.stream_compaction.apply_boolean_mask( self, as_column(boolean_mask) ) ) result._copy_type_metadata(self) return result + def interpolate( + self, + method="linear", + axis=0, + limit=None, + inplace=False, + limit_direction=None, + limit_area=None, + downcast=None, + **kwargs, + ): + """ + Interpolate data values between some points. + + Parameters + ---------- + method : str, default 'linear' + Interpolation technique to use. Currently, + only 'linear` is supported. + * 'linear': Ignore the index and treat the values as + equally spaced. This is the only method supported on MultiIndexes. + * 'index', 'values': linearly interpolate using the index as + an x-axis. Unsorted indices can lead to erroneous results. + axis : int, default 0 + Axis to interpolate along. Currently, + only 'axis=0' is supported. + inplace : bool, default False + Update the data in place if possible. 
+ + Returns + ------- + Series or DataFrame + Returns the same object type as the caller, interpolated at + some or all ``NaN`` values + + """ + + if method in {"pad", "ffill"} and limit_direction != "forward": + raise ValueError( + f"`limit_direction` must be 'forward' for method `{method}`" + ) + if method in {"backfill", "bfill"} and limit_direction != "backward": + raise ValueError( + f"`limit_direction` must be 'backward' for method `{method}`" + ) + + data = self + + if not isinstance(data._index, cudf.RangeIndex): + perm_sort = data._index.argsort() + data = data._gather(perm_sort) + + interpolator = cudf.core.algorithms.get_column_interpolator(method) + columns = {} + for colname, col in data._data.items(): + if col.nullable: + col = col.astype("float64").fillna(np.nan) + + # Interpolation methods may or may not need the index + columns[colname] = interpolator(col, index=data._index) + + result = self._from_data(columns, index=data._index) + + return ( + result + if isinstance(data._index, cudf.RangeIndex) + else result._gather(perm_sort.argsort()) + ) + def _quantiles( self, q, @@ -1453,8 +1534,8 @@ def _quantiles( libcudf.types.NullOrder[key] for key in null_precedence ] - result = self.__class__._from_table( - libcudf.quantiles.quantiles( + result = self.__class__._from_data( + *libcudf.quantiles.quantiles( self, q, interpolation, @@ -1548,11 +1629,11 @@ def rank( if source.empty: return source.astype("float64") - out_rank_table = libcudf.sort.rank_columns( + data, index = libcudf.sort.rank_columns( source, method_enum, na_option, ascending, pct ) - return self._from_table(out_rank_table).astype(np.float64) + return self._from_data(data, index).astype(np.float64) def repeat(self, repeats, axis=None): """Repeats elements consecutively. @@ -1639,24 +1720,24 @@ def _repeat(self, count): if not is_scalar(count): count = as_column(count) - result = self.__class__._from_table( - libcudf.filling.repeat(self, count) + result = self.__class__._from_data( + *libcudf.filling.repeat(self, count) ) result._copy_type_metadata(self) return result def _reverse(self): - result = self.__class__._from_table(libcudf.copying.reverse(self)) - return result + return self.__class__._from_data(*libcudf.copying.reverse(self)) def _fill(self, fill_values, begin, end, inplace): col_and_fill = zip(self._columns, fill_values) if not inplace: data_columns = (c._fill(v, begin, end) for (c, v) in col_and_fill) - data = zip(self._column_names, data_columns) - return self.__class__._from_table(Frame(data, self._index)) + return self.__class__._from_data( + zip(self._column_names, data_columns), self._index + ) for (c, v) in col_and_fill: c.fill(v, begin, end, inplace=True) @@ -1671,8 +1752,9 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): def _shift(self, offset, fill_value=None): data_columns = (col.shift(offset, fill_value) for col in self._columns) - data = zip(self._column_names, data_columns) - return self.__class__._from_table(Frame(data, self._index)) + return self.__class__._from_data( + zip(self._column_names, data_columns), self._index + ) def __array__(self, dtype=None): raise TypeError( @@ -1792,13 +1874,11 @@ def round(self, decimals=0, how="half_even"): "decimals must be an integer, a dict-like or a Series" ) - return self.__class__._from_table( - Frame( - data=cudf.core.column_accessor.ColumnAccessor( - cols, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - ) + return self.__class__._from_data( + data=cudf.core.column_accessor.ColumnAccessor( + cols, + 
multiindex=self._data.multiindex, + level_names=self._data.level_names, ), index=self._index, ) @@ -1923,8 +2003,8 @@ def sample( else: seed = np.int64(random_state) - result = self._from_table( - libcudf.copying.sample( + result = self.__class__._from_data( + *libcudf.copying.sample( self, n=n, replace=replace, @@ -2064,12 +2144,12 @@ def from_arrow(cls, data): ) # Handle dict arrays - cudf_category_frame = libcudf.table.Table() + cudf_category_frame = {} if len(dict_indices): dict_indices_table = pa.table(dict_indices) data = data.drop(dict_indices_table.column_names) - cudf_indices_frame = libcudf.interop.from_arrow( + cudf_indices_frame, _ = libcudf.interop.from_arrow( dict_indices_table, dict_indices_table.column_names ) # as dictionary size can vary, it can't be a single table @@ -2078,9 +2158,8 @@ def from_arrow(cls, data): for name in dict_dictionaries.keys() } - for name in cudf_indices_frame._data.names: - codes = cudf_indices_frame._data[name] - cudf_category_frame._data[name] = build_categorical_column( + for name, codes in cudf_indices_frame.items(): + cudf_category_frame[name] = build_categorical_column( cudf_dictionaries_columns[name], codes, mask=codes.base_mask, @@ -2090,30 +2169,20 @@ def from_arrow(cls, data): # Handle non-dict arrays cudf_non_category_frame = ( - libcudf.table.Table() + {} if data.num_columns == 0 - else libcudf.interop.from_arrow(data, data.column_names) + else libcudf.interop.from_arrow(data, data.column_names)[0] ) - if ( - cudf_non_category_frame._num_columns > 0 - and cudf_category_frame._num_columns > 0 - ): - result = cudf_non_category_frame - for name in cudf_category_frame._data.names: - result._data[name] = cudf_category_frame._data[name] - elif cudf_non_category_frame._num_columns > 0: - result = cudf_non_category_frame - else: - result = cudf_category_frame + result = {**cudf_non_category_frame, **cudf_category_frame} # There are some special cases that need to be handled # based on metadata. if pandas_dtypes: - for name in result._data.names: + for name in result: dtype = None if ( - len(result._data[name]) == 0 + len(result[name]) == 0 and pandas_dtypes[name] == "categorical" ): # When pandas_dtype is a categorical column and the size @@ -2139,18 +2208,14 @@ def from_arrow(cls, data): # struct fields, hence renaming the struct fields is # necessary by extracting the field names from arrow # struct types. 
- result._data[name] = result._data[name]._rename_fields( + result[name] = result[name]._rename_fields( [field.name for field in data[name].type] ) if dtype is not None: - result._data[name] = result._data[name].astype(dtype) - - result = libcudf.table.Table( - result._data.select_by_label(column_names) - ) + result[name] = result[name].astype(dtype) - return cls._from_table(result) + return cls._from_data({name: result[name] for name in column_names}) @annotate("TO_ARROW", color="orange", domain="cudf_python") def to_arrow(self): @@ -2209,8 +2274,8 @@ def drop_duplicates( if len(subset_cols) == 0: return self.copy(deep=True) - result = self._from_table( - libcudf.stream_compaction.drop_duplicates( + result = self.__class__._from_data( + *libcudf.stream_compaction.drop_duplicates( self, keys=subset, keep=keep, @@ -2256,7 +2321,7 @@ def replace(self, to_replace: Any, replacement: Any) -> Frame: else: copy_data = self._data.copy(deep=True) - result = self._from_table(Frame(copy_data, self._index)) + result = self._from_data(copy_data, self._index) return result @@ -2278,15 +2343,17 @@ def _copy_type_metadata( if include_index: if self._index is not None and other._index is not None: self._index._copy_type_metadata(other._index) - # When other._index is a CategoricalIndex, there is + # When other._index is a CategoricalIndex, the current index + # will be a NumericalIndex with an underlying CategoricalColumn + # (the above _copy_type_metadata call will have converted the + # column). Calling cudf.Index on that column generates the + # appropriate index. if isinstance( other._index, cudf.core.index.CategoricalIndex ) and not isinstance( self._index, cudf.core.index.CategoricalIndex ): - self._index = cudf.core.index.Index._from_table( - self._index - ) + self._index = cudf.Index(self._index._column) return self @@ -2376,8 +2443,9 @@ def isnull(self): GenericIndex([False, False, True, True, False, False], dtype='bool') """ data_columns = (col.isnull() for col in self._columns) - data = zip(self._column_names, data_columns) - return self.__class__._from_table(Frame(data, self._index)) + return self.__class__._from_data( + zip(self._column_names, data_columns), self._index + ) # Alias for isnull isna = isnull @@ -2456,8 +2524,9 @@ def notnull(self): GenericIndex([True, True, False, False, True, True], dtype='bool') """ data_columns = (col.notnull() for col in self._columns) - data = zip(self._column_names, data_columns) - return self.__class__._from_table(Frame(data, self._index)) + return self.__class__._from_data( + zip(self._column_names, data_columns), self._index + ) # Alias for notnull notna = notnull @@ -2526,7 +2595,7 @@ def tile(self, count): ------- The table containing the tiled "rows". 
""" - result = self.__class__._from_table(libcudf.reshape.tile(self, count)) + result = self.__class__._from_data(*libcudf.reshape.tile(self, count)) result._copy_type_metadata(self) return result @@ -3264,20 +3333,16 @@ def _is_sorted(self, ascending=None, null_position=None): ) def _split(self, splits, keep_index=True): - result = libcudf.copying.table_split( + results = libcudf.copying.table_split( self, splits, keep_index=keep_index ) - result = [self.__class__._from_table(tbl) for tbl in result] - return result + return [self.__class__._from_data(*result) for result in results] def _encode(self): - keys, indices = libcudf.transform.table_encode(self) - keys = self.__class__._from_table(keys) - for col in keys._data: - keys._data[col] = keys._data[col]._with_type_metadata( - self._data[col].dtype - ) - + data, index, indices = libcudf.transform.table_encode(self) + for name, col in data.items(): + data[name] = col._with_type_metadata(self._data[name].dtype) + keys = self.__class__._from_data(data, index) return keys, indices def _reindex( @@ -3312,7 +3377,7 @@ def _reindex( if index is not None: index = cudf.core.index.as_index(index) - if isinstance(index, cudf.core.MultiIndex): + if isinstance(index, cudf.MultiIndex): idx_dtype_match = ( df.index._source_data.dtypes == index._source_data.dtypes ).all() @@ -3344,13 +3409,11 @@ def _reindex( for name in names } - result = self.__class__._from_table( - Frame( - data=cudf.core.column_accessor.ColumnAccessor( - cols, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - ) + result = self.__class__._from_data( + data=cudf.core.column_accessor.ColumnAccessor( + cols, + multiindex=self._data.multiindex, + level_names=self._data.level_names, ), index=index, ) @@ -3359,8 +3422,9 @@ def _reindex( def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) - data = zip(self._column_names, data_columns) - return self.__class__._from_table(Frame(data, self._index)) + return self.__class__._from_data( + zip(self._column_names, data_columns), self._index + ) def _binaryop( self, @@ -3613,6 +3677,860 @@ def __pos__(self): def __abs__(self): return self._unaryop("abs") + # Reductions + @classmethod + def _get_axis_from_axis_arg(cls, axis): + try: + return cls._SUPPORT_AXIS_LOOKUP[axis] + except KeyError: + raise ValueError(f"No axis named {axis} for object type {cls}") + + def _reduce(self, *args, **kwargs): + raise NotImplementedError( + f"Reductions are not supported for objects of type {type(self)}." + ) + + def min( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + ): + """ + Return the minimum of the values in the DataFrame. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + level: int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only: bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. 
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.min() + a 1 + b 7 + dtype: int64 + """ + return self._reduce( + "min", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def max( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + ): + """ + Return the maximum of the values in the DataFrame. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + level: int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only: bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.max() + a 4 + b 10 + dtype: int64 + """ + return self._reduce( + "max", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def sum( + self, + axis=None, + skipna=None, + dtype=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + """ + Return sum of the values in the DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + dtype: data type + Data type to cast the result to. + min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.sum() + a 10 + b 34 + dtype: int64 + """ + return self._reduce( + "sum", + axis=axis, + skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + + def product( + self, + axis=None, + skipna=None, + dtype=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + """ + Return product of the values in the DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + dtype: data type + Data type to cast the result to. + min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are level`, `numeric_only`. 
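``sum`` above (and ``product`` just below) take the pandas-style ``min_count`` parameter described in their docstrings; a hedged illustration of what the default of 0 means (expected to match the documented behaviour, not verified output):

    >>> import cudf
    >>> s = cudf.Series([None, None], dtype="float64")
    >>> s.sum()               # min_count=0: an all-NA sum is documented to be 0
    >>> s.sum(min_count=1)    # fewer than one valid value present, so the result is expected to be NA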
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.product() + a 24 + b 5040 + dtype: int64 + """ + axis = self._get_axis_from_axis_arg(axis) + return self._reduce( + # cuDF columns use "product" as the op name, but cupy uses "prod" + # and we need cupy if axis == 1. + "product" if axis == 0 else "prod", + axis=axis, + skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + + # Alias for pandas compatibility. + prod = product + + def mean( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return the mean of the values for the requested axis. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. Not implemented for + Series. + **kwargs + Additional keyword arguments to be passed to the function. + + Returns + ------- + mean : Series or DataFrame (if level specified) + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.mean() + a 2.5 + b 8.5 + dtype: float64 + """ + return self._reduce( + "mean", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def std( + self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + """ + Return sample standard deviation of the DataFrame. + + Normalized by N-1 by default. This can be changed using + the `ddof` argument + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof: int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is N - ddof, where N represents the number of elements. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level` and + `numeric_only` + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.std() + a 1.290994 + b 1.290994 + dtype: float64 + """ + + return self._reduce( + "std", + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + + def var( + self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + """ + Return unbiased variance of the DataFrame. + + Normalized by N-1 by default. This can be changed using the + ddof argument + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof: int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is + N - ddof, where N represents the number of elements. 
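``product`` (defined earlier in this hunk) resolves the axis before reducing because the column-wise op is registered as ``"product"`` while the row-wise path goes through cupy, which spells it ``"prod"``; ``prod`` itself is only an alias. A short usage sketch (row values worked out by hand, not verified output):

    >>> import cudf
    >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]})
    >>> df.prod()            # alias for df.product(); column-wise uses the "product" op name
    >>> df.product(axis=1)   # dispatched to cupy as "prod"; expected per-row values 7, 16, 27, 40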
+ + Returns + ------- + scalar + + Notes + ----- + Parameters currently not supported are `level` and + `numeric_only` + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.var() + a 1.666667 + b 1.666667 + dtype: float64 + """ + return self._reduce( + "var", + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + + def kurtosis( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return Fisher's unbiased kurtosis of a sample. + + Kurtosis obtained using Fisher’s definition of + kurtosis (kurtosis of normal == 0.0). Normalized by N-1. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + + Returns + ------- + Series or scalar + + Notes + ----- + Parameters currently not supported are `level` and `numeric_only` + + Examples + -------- + **Series** + + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4]) + >>> series.kurtosis() + -1.1999999999999904 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.kurt() + a -1.2 + b -1.2 + dtype: float64 + """ + if axis not in (0, "index", None): + raise NotImplementedError("Only axis=0 is currently supported.") + + return self._reduce( + "kurtosis", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + # Alias for kurtosis. + @copy_docstring(kurtosis) + def kurt( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + return self.kurtosis( + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def skew( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return unbiased Fisher-Pearson skew of a sample. + + Parameters + ---------- + skipna: bool, default True + Exclude NA/null values when computing the result. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `axis`, `level` and + `numeric_only` + + Examples + -------- + **Series** + + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6]) + >>> series + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + 6 6 + dtype: int64 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]}) + >>> df.skew() + a 0.00000 + b -0.37037 + dtype: float64 + """ + if axis not in (0, "index", None): + raise NotImplementedError("Only axis=0 is currently supported.") + + return self._reduce( + "skew", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def all(self, axis=0, skipna=True, level=None, **kwargs): + """ + Return whether all elements are True in DataFrame. + + Parameters + ---------- + + skipna: bool, default True + Exclude NA/null values. If the entire row/column is NA and + skipna is True, then the result will be True, as for an + empty row/column. + If skipna is False, then NA are treated as True, because + these are not equal to zero. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `axis`, `bool_only`, `level`. 
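Both ``kurtosis``/``kurt`` and ``skew`` above guard against anything other than the index axis, so only ``0``, ``"index"`` or ``None`` are accepted:

    >>> import cudf
    >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]})
    >>> df.kurtosis(axis=1)    # raises NotImplementedError("Only axis=0 is currently supported.")
    >>> df.skew(axis="index")  # accepted spelling, equivalent to axis=0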
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.all() + a True + b False + dtype: bool + """ + return self._reduce( + "all", axis=axis, skipna=skipna, level=level, **kwargs, + ) + + def any(self, axis=0, skipna=True, level=None, **kwargs): + """ + Return whether any elements is True in DataFrame. + + Parameters + ---------- + + skipna: bool, default True + Exclude NA/null values. If the entire row/column is NA and + skipna is True, then the result will be False, as for an + empty row/column. + If skipna is False, then NA are treated as True, because + these are not equal to zero. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `axis`, `bool_only`, `level`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.any() + a True + b True + dtype: bool + """ + return self._reduce( + "any", axis=axis, skipna=skipna, level=level, **kwargs, + ) + + def sum_of_squares(self, dtype=None): + """Return the sum of squares of values. + + Parameters + ---------- + dtype: data type + Data type to cast the result to. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.sum_of_squares() + a 38 + b 249 + dtype: int64 + """ + return self._reduce("sum_of_squares", dtype=dtype) + + def median( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return the median of the values for the requested axis. + + Parameters + ---------- + + skipna : bool, default True + Exclude NA/null values when computing the result. + + Returns + ------- + scalar + + Notes + ----- + Parameters currently not supported are `level` and `numeric_only`. + + Examples + -------- + >>> import cudf + >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) + >>> ser + 0 10 + 1 25 + 2 3 + 3 25 + 4 24 + 5 6 + dtype: int64 + >>> ser.median() + 17.0 + """ + return self._reduce( + "median", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + # Scans + def _scan(self, op, axis=None, skipna=True, cast_to_int=False): + skipna = True if skipna is None else skipna + + results = {} + for name, col in self._data.items(): + if skipna: + result_col = self._data[name].nans_to_nulls() + else: + result_col = self._data[name].copy() + if result_col.has_nulls: + # Workaround as find_first_value doesn't seem to work + # incase of bools. + first_index = int( + result_col.isnull().astype("int8").find_first_value(1) + ) + result_col[first_index:] = None + + if ( + cast_to_int + and not is_decimal_dtype(result_col.dtype) + and ( + np.issubdtype(result_col.dtype, np.integer) + or np.issubdtype(result_col.dtype, np.bool_) + ) + ): + # For reductions that accumulate a value (e.g. sum, not max) + # pandas returns an int64 dtype for all int or bool dtypes. + result_col = result_col.astype(np.int64) + results[name] = result_col._apply_scan_op(op) + # TODO: This will work for Index because it's passing self._index + # (which is None), but eventually we may want to remove that parameter + # for Index._from_data and simplify. + return self._from_data(results, index=self._index) + + def cummin(self, axis=None, skipna=True, *args, **kwargs): + """ + Return cumulative minimum of the Series or DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. 
+ skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + + Returns + ------- + Series or DataFrame + + Examples + -------- + **Series** + + >>> import cudf + >>> ser = cudf.Series([1, 5, 2, 4, 3]) + >>> ser.cummin() + 0 1 + 1 1 + 2 1 + 3 1 + 4 1 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.cummin() + a b + 0 1 7 + 1 1 7 + 2 1 7 + 3 1 7 + """ + return self._scan("min", axis=axis, skipna=skipna, *args, **kwargs) + + def cummax(self, axis=None, skipna=True, *args, **kwargs): + """ + Return cumulative maximum of the Series or DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + + Returns + ------- + Series or DataFrame + + Examples + -------- + **Series** + + >>> import cudf + >>> ser = cudf.Series([1, 5, 2, 4, 3]) + >>> ser.cummax() + 0 1 + 1 5 + 2 5 + 3 5 + 4 5 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.cummax() + a b + 0 1 7 + 1 2 8 + 2 3 9 + 3 4 10 + """ + return self._scan("max", axis=axis, skipna=skipna, *args, **kwargs) + + def cumsum(self, axis=None, skipna=True, *args, **kwargs): + """ + Return cumulative sum of the Series or DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + + + Returns + ------- + Series or DataFrame + + Examples + -------- + **Series** + + >>> import cudf + >>> ser = cudf.Series([1, 5, 2, 4, 3]) + >>> ser.cumsum() + 0 1 + 1 6 + 2 8 + 3 12 + 4 15 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> s.cumsum() + a b + 0 1 7 + 1 3 15 + 2 6 24 + 3 10 34 + """ + return self._scan( + "sum", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs + ) + + def cumprod(self, axis=None, skipna=True, *args, **kwargs): + """ + Return cumulative product of the Series or DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, + the result will be NA. + + Returns + ------- + Series or DataFrame + + Examples + -------- + **Series** + + >>> import cudf + >>> ser = cudf.Series([1, 5, 2, 4, 3]) + >>> ser.cumprod() + 0 1 + 1 5 + 2 10 + 3 40 + 4 120 + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> s.cumprod() + a b + 0 1 7 + 1 2 56 + 2 6 504 + 3 24 5040 + """ + return self._scan( + "prod", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs + ) + class SingleColumnFrame(Frame): """A one-dimensional frame. @@ -3621,6 +4539,46 @@ class SingleColumnFrame(Frame): this class. 
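The ``_scan`` helper that backs the cumulative methods above does three things per column before calling ``_apply_scan_op``: with ``skipna`` it converts NaNs to nulls, without it it nulls out every row from the first null onward (the documented ``find_first_value`` workaround), and for accumulating ops on int/bool columns it casts to int64 to match pandas. A hedged illustration (results reasoned from that code, not verified output):

    >>> import cudf
    >>> s = cudf.Series([1, None, 3], dtype="int8")
    >>> s.cumsum()               # null skipped; expected int64 result [1, <NA>, 4]
    >>> s.cumsum(skipna=False)   # everything from the first null on becomes null: [1, <NA>, <NA>]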
""" + _SUPPORT_AXIS_LOOKUP = { + 0: 0, + None: 0, + "index": 0, + } + + def _reduce( + self, op, axis=None, level=None, numeric_only=None, **kwargs, + ): + if axis not in (None, 0): + raise NotImplementedError("axis parameter is not implemented yet") + + if level is not None: + raise NotImplementedError("level parameter is not implemented yet") + + if numeric_only not in (None, True): + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + return getattr(self._column, op)(**kwargs) + + def _scan(self, op, axis=None, *args, **kwargs): + if axis not in (None, 0): + raise NotImplementedError("axis parameter is not implemented yet") + + return super()._scan(op, axis=axis, *args, **kwargs) + + @classmethod + def _from_data( + cls, + data: MutableMapping, + index: Optional[cudf.core.index.BaseIndex] = None, + name: Any = None, + ): + + out = super()._from_data(data, index) + if name is not None: + out.name = name + return out + @property def name(self): """The name of this object.""" @@ -3642,6 +4600,12 @@ def shape(self): return (len(self),) def __iter__(self): + """ + Iterating over a GPU object is not effecient and hence not supported. + + Consider using ``.to_arrow()``, ``.to_pandas()`` or ``.values_host`` + if you wish to iterate over the values. + """ cudf.utils.utils.raise_iteration_error(obj=self) def __len__(self): @@ -3895,16 +4859,6 @@ def factorize(self, na_sentinel=-1): """ return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) - @property - def _copy_construct_defaults(self): - """A default dictionary of kwargs to be used for copy construction.""" - raise NotImplementedError - - def _copy_construct(self, **kwargs): - """Shallow copy this object by replacing certain ctor args. - """ - return self.__class__(**{**self._copy_construct_defaults, **kwargs}) - def _binaryop( self, other: T, @@ -3963,8 +4917,9 @@ def _binaryop( result_name: (self._column, other, reflect, fill_value) } - return self._copy_construct( - data=type(self)._colwise_binop(operands, fn)[result_name], + return self._from_data( + data=type(self)._colwise_binop(operands, fn), + index=self._index, name=result_name, ) @@ -4012,7 +4967,7 @@ def _get_replacement_values_for_columns( col: [value] if _is_non_decimal_numeric_dtype(columns_dtype_map[col]) else cudf.utils.utils.scalar_broadcast_to( - value, (len(to_replace),), np.dtype(type(value)), + value, (len(to_replace),), cudf.dtype(type(value)), ) for col in columns_dtype_map } diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 29c29691389..fd425d9de76 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -195,9 +195,9 @@ def agg(self, func): # Note: When there are no key columns, the below produces # a Float64Index, while Pandas returns an Int64Index # (GH: 6945) - result = self._groupby.aggregate(self.obj, normalized_aggs) - - result = cudf.DataFrame._from_table(result) + result = cudf.DataFrame._from_data( + *self._groupby.aggregate(self.obj, normalized_aggs) + ) if self._sort: result = result.sort_index() @@ -220,13 +220,17 @@ def agg(self, func): else: raise + if libgroupby._is_all_scan_aggregate(normalized_aggs): + # Scan aggregations return rows in original index order + return self._mimic_pandas_order(result) + # set index names to be group key names if len(result): result.index.names = self.grouping.names # copy categorical information from keys to the result index: result.index._copy_type_metadata(self.grouping.keys) - 
result._index = cudf.core.index.Index._from_table(result._index) + result._index = cudf.Index(result._index) if not self._as_index: for col_name in reversed(self.grouping._named_columns): @@ -288,9 +292,7 @@ def deserialize(cls, header, frames): def _grouped(self): grouped_keys, grouped_values, offsets = self._groupby.groups(self.obj) - - grouped_keys = cudf.Index._from_table(grouped_keys) - grouped_values = self.obj.__class__._from_table(grouped_values) + grouped_values = self.obj.__class__._from_data(*grouped_values) grouped_values._copy_type_metadata(self.obj) group_names = grouped_keys.unique() return (group_names, offsets, grouped_keys, grouped_values) @@ -350,10 +352,10 @@ def pipe(self, func, *args, **kwargs): See also -------- - cudf.core.series.Series.pipe + cudf.Series.pipe Apply a function with arguments to a series. - cudf.core.dataframe.DataFrame.pipe + cudf.DataFrame.pipe Apply a function with arguments to a dataframe. apply @@ -447,7 +449,7 @@ def mult(df): """ if not callable(function): raise TypeError(f"type {type(function)} is not callable") - _, offsets, _, grouped_values = self._grouped() + group_names, offsets, _, grouped_values = self._grouped() ngroups = len(offsets) - 1 if ngroups > self._MAX_GROUPS_BEFORE_WARN: @@ -465,9 +467,7 @@ def mult(df): return self.obj.__class__() if cudf.utils.dtypes.is_scalar(chunk_results[0]): - result = cudf.Series( - chunk_results, index=self.grouping.keys[offsets[:-1]] - ) + result = cudf.Series(chunk_results, index=group_names) result.index.names = self.grouping.names elif isinstance(chunk_results[0], cudf.Series): result = cudf.concat(chunk_results, axis=1).T @@ -815,14 +815,21 @@ def cummax(self): """Get the column-wise cumulative maximum value in each group.""" return self.agg("cummax") + def first(self): + """Get the first non-null value in each group.""" + return self.agg("first") + + def last(self): + """Get the last non-null value in each group.""" + return self.agg("last") + def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: """Internal implementation for `ffill` and `bfill` """ value_columns = self.grouping.values - result = self._groupby.replace_nulls( - Table(value_columns._data), method + result = self.obj.__class__._from_data( + self._groupby.replace_nulls(Table(value_columns._data), method) ) - result = self.obj.__class__._from_table(result) result = self._mimic_pandas_order(result) return result._copy_type_metadata(value_columns) @@ -936,9 +943,9 @@ def fillna( return getattr(self, method, limit)() value_columns = self.grouping.values - _, grouped_values, _ = self._groupby.groups(Table(value_columns._data)) + _, (data, index), _ = self._groupby.groups(Table(value_columns._data)) - grouped = self.obj.__class__._from_data(grouped_values._data) + grouped = self.obj.__class__._from_data(data, index) result = grouped.fillna( value=value, inplace=inplace, axis=axis, limit=limit ) @@ -984,21 +991,20 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") - value_column_names = [ - x for x in self.obj._column_names if x not in self.grouping.names - ] - num_columns_to_shift = len(value_column_names) + value_columns = self.grouping.values if is_list_like(fill_value): - if not len(fill_value) == num_columns_to_shift: + if not len(fill_value) == len(value_columns._data): raise ValueError( "Mismatched number of columns and values to fill." 
) else: - fill_value = [fill_value] * num_columns_to_shift + fill_value = [fill_value] * len(value_columns._data) - value_columns = self.obj._data.select_by_label(value_column_names) - result = self._groupby.shift(Table(value_columns), periods, fill_value) - return self.obj.__class__._from_table(result) + result = self.obj.__class__._from_data( + *self._groupby.shift(Table(value_columns), periods, fill_value) + ) + result = self._mimic_pandas_order(result) + return result._copy_type_metadata(value_columns) def _mimic_pandas_order( self, result: DataFrameOrSeries @@ -1007,104 +1013,103 @@ def _mimic_pandas_order( matching that of pandas. This also adds appropriate indices. """ sorted_order_column = arange(0, result._data.nrows) - _, order, _ = self._groupby.groups( + _, (order, _), _ = self._groupby.groups( Table({"sorted_order_column": sorted_order_column}) ) - order = order._data["sorted_order_column"] - gather_map = order.argsort() + gather_map = order["sorted_order_column"].argsort() result = result.take(gather_map) result.index = self.obj.index return result class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): + """ + Group DataFrame using a mapper or by a Series of columns. + + A groupby operation involves some combination of splitting the object, + applying a function, and combining the results. This can be used to + group large amounts of data and compute operations on these groups. + + Parameters + ---------- + by : mapping, function, label, or list of labels + Used to determine the groups for the groupby. If by is a + function, it’s called on each value of the object’s index. + If a dict or Series is passed, the Series or dict VALUES will + be used to determine the groups (the Series’ values are first + aligned; see .align() method). If a cupy array is passed, the + values are used as-is determine the groups. A label or list + of labels may be passed to group by the columns in self. + Notice that a tuple is interpreted as a (single) key. + level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels. + as_index : bool, default True + For aggregated output, return object with group labels as + the index. Only relevant for DataFrame input. + as_index=False is effectively “SQL-style” grouped output. + sort : bool, default False + Sort result by group key. Differ from Pandas, cudf defaults to + ``False`` for better performance. Note this does not influence + the order of observations within each group. Groupby preserves + the order of rows within each group. + dropna : bool, optional + If True (default), do not include the "null" group. + + Returns + ------- + DataFrameGroupBy + Returns a groupby object that contains information + about the groups. + + Examples + -------- + >>> import cudf + >>> import pandas as pd + >>> df = cudf.DataFrame({'Animal': ['Falcon', 'Falcon', + ... 'Parrot', 'Parrot'], + ... 'Max Speed': [380., 370., 24., 26.]}) + >>> df + Animal Max Speed + 0 Falcon 380.0 + 1 Falcon 370.0 + 2 Parrot 24.0 + 3 Parrot 26.0 + >>> df.groupby(['Animal']).mean() + Max Speed + Animal + Falcon 375.0 + Parrot 25.0 + + >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... 
['Captive', 'Wild', 'Captive', 'Wild']] + >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) + >>> df = cudf.DataFrame({'Max Speed': [390., 350., 30., 20.]}, + index=index) + >>> df + Max Speed + Animal Type + Falcon Captive 390.0 + Wild 350.0 + Parrot Captive 30.0 + Wild 20.0 + >>> df.groupby(level=0).mean() + Max Speed + Animal + Falcon 370.0 + Parrot 25.0 + >>> df.groupby(level="Type").mean() + Max Speed + Type + Wild 185.0 + Captive 210.0 + """ + _PROTECTED_KEYS = frozenset(("obj",)) def __init__( self, obj, by=None, level=None, sort=False, as_index=True, dropna=True ): - """ - Group DataFrame using a mapper or by a Series of columns. - - A groupby operation involves some combination of splitting the object, - applying a function, and combining the results. This can be used to - group large amounts of data and compute operations on these groups. - - Parameters - ---------- - by : mapping, function, label, or list of labels - Used to determine the groups for the groupby. If by is a - function, it’s called on each value of the object’s index. - If a dict or Series is passed, the Series or dict VALUES will - be used to determine the groups (the Series’ values are first - aligned; see .align() method). If a cupy array is passed, the - values are used as-is determine the groups. A label or list - of labels may be passed to group by the columns in self. - Notice that a tuple is interpreted as a (single) key. - level : int, level name, or sequence of such, default None - If the axis is a MultiIndex (hierarchical), group by a particular - level or levels. - as_index : bool, default True - For aggregated output, return object with group labels as - the index. Only relevant for DataFrame input. - as_index=False is effectively “SQL-style” grouped output. - sort : bool, default False - Sort result by group key. Differ from Pandas, cudf defaults to - ``False`` for better performance. Note this does not influence - the order of observations within each group. Groupby preserves - the order of rows within each group. - dropna : bool, optional - If True (default), do not include the "null" group. - - Returns - ------- - DataFrameGroupBy - Returns a groupby object that contains information - about the groups. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> df = cudf.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 'Max Speed': [380., 370., 24., 26.]}) - >>> df - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 - >>> df.groupby(['Animal']).mean() - Max Speed - Animal - Falcon 375.0 - Parrot 25.0 - - >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) - >>> df = cudf.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - index=index) - >>> df - Max Speed - Animal Type - Falcon Captive 390.0 - Wild 350.0 - Parrot Captive 30.0 - Wild 20.0 - >>> df.groupby(level=0).mean() - Max Speed - Animal - Falcon 370.0 - Parrot 25.0 - >>> df.groupby(level="Type").mean() - Max Speed - Type - Wild 185.0 - Captive 210.0 - - """ super().__init__( obj=obj, by=by, @@ -1127,68 +1132,68 @@ def nunique(self): class SeriesGroupBy(GroupBy): + """ + Group Series using a mapper or by a Series of columns. + + A groupby operation involves some combination of splitting the object, + applying a function, and combining the results. 
This can be used to + group large amounts of data and compute operations on these groups. + + Parameters + ---------- + by : mapping, function, label, or list of labels + Used to determine the groups for the groupby. If by is a + function, it’s called on each value of the object’s index. + If a dict or Series is passed, the Series or dict VALUES will + be used to determine the groups (the Series’ values are first + aligned; see .align() method). If an cupy array is passed, the + values are used as-is determine the groups. A label or list + of labels may be passed to group by the columns in self. + Notice that a tuple is interpreted as a (single) key. + level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels. + as_index : bool, default True + For aggregated output, return object with group labels as + the index. Only relevant for DataFrame input. + as_index=False is effectively “SQL-style” grouped output. + sort : bool, default False + Sort result by group key. Differ from Pandas, cudf defaults to + ``False`` for better performance. Note this does not influence + the order of observations within each group. Groupby preserves + the order of rows within each group. + + Returns + ------- + SeriesGroupBy + Returns a groupby object that contains information + about the groups. + + Examples + -------- + >>> ser = cudf.Series([390., 350., 30., 20.], + ... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... name="Max Speed") + >>> ser + Falcon 390.0 + Falcon 350.0 + Parrot 30.0 + Parrot 20.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(level=0).mean() + Falcon 370.0 + Parrot 25.0 + Name: Max Speed, dtype: float64 + >>> ser.groupby(ser > 100).mean() + Max Speed + False 25.0 + True 370.0 + Name: Max Speed, dtype: float64 + """ + def __init__( self, obj, by=None, level=None, sort=False, as_index=True, dropna=True ): - """ - Group Series using a mapper or by a Series of columns. - - A groupby operation involves some combination of splitting the object, - applying a function, and combining the results. This can be used to - group large amounts of data and compute operations on these groups. - - Parameters - ---------- - by : mapping, function, label, or list of labels - Used to determine the groups for the groupby. If by is a - function, it’s called on each value of the object’s index. - If a dict or Series is passed, the Series or dict VALUES will - be used to determine the groups (the Series’ values are first - aligned; see .align() method). If an cupy array is passed, the - values are used as-is determine the groups. A label or list - of labels may be passed to group by the columns in self. - Notice that a tuple is interpreted as a (single) key. - level : int, level name, or sequence of such, default None - If the axis is a MultiIndex (hierarchical), group by a particular - level or levels. - as_index : bool, default True - For aggregated output, return object with group labels as - the index. Only relevant for DataFrame input. - as_index=False is effectively “SQL-style” grouped output. - sort : bool, default False - Sort result by group key. Differ from Pandas, cudf defaults to - ``False`` for better performance. Note this does not influence - the order of observations within each group. Groupby preserves - the order of rows within each group. - - Returns - ------- - SeriesGroupBy - Returns a groupby object that contains information - about the groups. 
- - Examples - -------- - >>> ser = cudf.Series([390., 350., 30., 20.], - ... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... name="Max Speed") - >>> ser - Falcon 390.0 - Falcon 350.0 - Parrot 30.0 - Parrot 20.0 - Name: Max Speed, dtype: float64 - >>> ser.groupby(level=0).mean() - Falcon 370.0 - Parrot 25.0 - Name: Max Speed, dtype: float64 - >>> ser.groupby(ser > 100).mean() - Max Speed - False 25.0 - True 370.0 - Name: Max Speed, dtype: float64 - - """ super().__init__( obj=obj, by=by, @@ -1215,6 +1220,14 @@ def agg(self, func): return result + def apply(self, func): + result = super().apply(func) + + # apply Series name to result + result.name = self.obj.name + + return result + class Grouper(object): def __init__(self, key=None, level=None): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 691b6ab2e29..6be21ce74d2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -4,7 +4,16 @@ import pickle from numbers import Number -from typing import Any, Dict, Optional, Tuple, Type, Union +from typing import ( + Any, + Dict, + List, + MutableMapping, + Optional, + Tuple, + Type, + Union, +) import cupy import numpy as np @@ -13,7 +22,7 @@ from pandas._config import get_option import cudf -from cudf._lib.datetime import is_leap_year +from cudf._lib.datetime import extract_quarter, is_leap_year from cudf._lib.filling import sequence from cudf._lib.search import search_sorted from cudf._lib.table import Table @@ -326,7 +335,7 @@ def set_names(self, names, level=None, inplace=False): See Also -------- - cudf.core.index.Index.rename : Able to set new names without level. + cudf.Index.rename : Able to set new names without level. Examples -------- @@ -518,83 +527,20 @@ def gpu_values(self): """ return self._values.data_array_view - def min(self): - """ - Return the minimum value of the Index. - - Returns - ------- - scalar - Minimum value. - - See Also - -------- - cudf.core.index.Index.max : Return the maximum value in an Index. - cudf.core.series.Series.min : Return the minimum value in a Series. - cudf.core.dataframe.DataFrame.min : Return the minimum values in - a DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.min() - 1 - """ - return self._values.min() - - def max(self): - """ - Return the maximum value of the Index. - - Returns - ------- - scalar - Maximum value. - - See Also - -------- - cudf.core.index.Index.min : Return the minimum value in an Index. - cudf.core.series.Series.max : Return the maximum value in a Series. - cudf.core.dataframe.DataFrame.max : Return the maximum values in - a DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.max() - 3 - """ - return self._values.max() - - def sum(self): - """ - Return the sum of all values of the Index. - - Returns - ------- - scalar - Sum of all values. 
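The per-method ``min``/``max``/``sum`` definitions being removed from the index base class here are now covered by the generic reductions added to ``Frame`` earlier in this diff, so the behaviour their old docstrings demonstrated should be unchanged:

    >>> import cudf
    >>> idx = cudf.Index([3, 2, 1])
    >>> idx.min(), idx.max(), idx.sum()   # still expected to give (1, 3, 6)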
- - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.sum() - 6 - """ - return self._values.sum() - @classmethod def _concat(cls, objs): - data = concat_columns([o._values for o in objs]) + if all(isinstance(obj, RangeIndex) for obj in objs): + result = _concat_range_index(objs) + else: + data = concat_columns([o._values for o in objs]) + result = as_index(data) + names = {obj.name for obj in objs} if len(names) == 1: [name] = names else: name = None - result = as_index(data) + result.name = name return result @@ -646,12 +592,12 @@ def append(self, other): if is_mixed_with_object_dtype(this, other): got_dtype = ( other.dtype - if this.dtype == np.dtype("object") + if this.dtype == cudf.dtype("object") else this.dtype ) raise TypeError( f"cudf does not support appending an Index of " - f"dtype `{np.dtype('object')}` with an Index " + f"dtype `{cudf.dtype('object')}` with an Index " f"of dtype `{got_dtype}`, please type-cast " f"either one of them to same dtypes." ) @@ -724,39 +670,6 @@ def difference(self, other, sort=None): return difference - def _copy_construct(self, **kwargs): - # Need to override the parent behavior because pandas allows operations - # on unsigned types to return signed values, forcing us to choose the - # right index type here. - data = kwargs.get("data") - cls = self.__class__ - - if data is not None: - if self.dtype != data.dtype: - # TODO: This logic is largely copied from `as_index`. The two - # should be unified via a centralized type dispatching scheme. - if isinstance(data, NumericalColumn): - try: - cls = _dtype_to_index[data.dtype.type] - except KeyError: - cls = GenericIndex - elif isinstance(data, StringColumn): - cls = StringIndex - elif isinstance(data, DatetimeColumn): - cls = DatetimeIndex - elif isinstance(data, TimeDeltaColumn): - cls = TimedeltaIndex - elif isinstance(data, CategoricalColumn): - cls = CategoricalIndex - elif cls is RangeIndex: - # RangeIndex must convert to other numerical types for ops - try: - cls = _dtype_to_index[data.dtype.type] - except KeyError: - cls = GenericIndex - - return cls(**{**self._copy_construct_defaults, **kwargs}) - def sort_values(self, return_indexer=False, ascending=True, key=None): """ Return a sorted copy of the index, and optionally return the indices @@ -780,8 +693,8 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): See Also -------- - cudf.core.series.Series.min : Sort values of a Series. - cudf.core.dataframe.DataFrame.sort_values : Sort values in a DataFrame. + cudf.Series.min : Sort values of a Series. + cudf.DataFrame.sort_values : Sort values in a DataFrame. 
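``_concat`` now keeps the result as a ``RangeIndex`` whenever every input is a ``RangeIndex`` (delegating to the ``_concat_range_index`` helper added at the end of this file) and only materialises columns otherwise; the name is preserved only when all inputs share it. Assuming ``append`` still routes through ``_concat`` (that part of the method is not shown in this hunk), the observable effect would be roughly:

    >>> import cudf
    >>> cudf.RangeIndex(0, 3).append(cudf.RangeIndex(3, 6))  # consecutive: expected to stay a RangeIndex
    >>> cudf.RangeIndex(0, 3).append(cudf.RangeIndex(5, 8))  # gap between ranges: expected to fall back to Int64Index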
Examples -------- @@ -1350,9 +1263,9 @@ def from_pandas(cls, index, nan_as_null=None): >>> import numpy as np >>> data = [10, 20, 30, np.nan] >>> pdi = pd.Index(data) - >>> cudf.core.index.Index.from_pandas(pdi) + >>> cudf.Index.from_pandas(pdi) Float64Index([10.0, 20.0, 30.0, ], dtype='float64') - >>> cudf.core.index.Index.from_pandas(pdi, nan_as_null=False) + >>> cudf.Index.from_pandas(pdi, nan_as_null=False) Float64Index([10.0, 20.0, 30.0, nan], dtype='float64') """ if not isinstance(index, pd.Index): @@ -1363,52 +1276,43 @@ def from_pandas(cls, index, nan_as_null=None): return ind @classmethod - def _from_table(cls, table): - if not isinstance(table, RangeIndex): - if table._num_columns == 0: - raise ValueError("Cannot construct Index from any empty Table") - if table._num_columns == 1: - values = next(iter(table._data.values())) - - if isinstance(values, NumericalColumn): - try: - index_class_type = _dtype_to_index[values.dtype.type] - except KeyError: - index_class_type = GenericIndex - out = super(BaseIndex, index_class_type).__new__( - index_class_type - ) - elif isinstance(values, DatetimeColumn): - out = super(BaseIndex, DatetimeIndex).__new__( - DatetimeIndex - ) - elif isinstance(values, TimeDeltaColumn): - out = super(BaseIndex, TimedeltaIndex).__new__( - TimedeltaIndex - ) - elif isinstance(values, StringColumn): - out = super(BaseIndex, StringIndex).__new__(StringIndex) - elif isinstance(values, CategoricalColumn): - out = super(BaseIndex, CategoricalIndex).__new__( - CategoricalIndex - ) - out._data = table._data - out._index = None - return out - else: - return cudf.MultiIndex._from_table( - table, names=table._data.names + def _from_data( + cls, + data: MutableMapping, + index: Optional[BaseIndex] = None, + name: Any = None, + ) -> BaseIndex: + assert index is None + if not isinstance(data, cudf.core.column_accessor.ColumnAccessor): + data = cudf.core.column_accessor.ColumnAccessor(data) + if len(data) == 0: + raise ValueError("Cannot construct Index from any empty Table") + if len(data) == 1: + values = next(iter(data.values())) + + if isinstance(values, NumericalColumn): + try: + index_class_type = _dtype_to_index[values.dtype.type] + except KeyError: + index_class_type = GenericIndex + out = super(BaseIndex, index_class_type).__new__( + index_class_type ) + elif isinstance(values, DatetimeColumn): + out = super(BaseIndex, DatetimeIndex).__new__(DatetimeIndex) + elif isinstance(values, TimeDeltaColumn): + out = super(BaseIndex, TimedeltaIndex).__new__(TimedeltaIndex) + elif isinstance(values, StringColumn): + out = super(BaseIndex, StringIndex).__new__(StringIndex) + elif isinstance(values, CategoricalColumn): + out = super(BaseIndex, CategoricalIndex).__new__( + CategoricalIndex + ) + out._data = data + out._index = None + return out else: - return as_index(table) - - @property - def _copy_construct_defaults(self): - return {"data": self._column, "name": self.name} - - @classmethod - def _from_data(cls, data, index=None): - return cls._from_table(SingleColumnFrame(data=data)) + return cudf.MultiIndex._from_data(data) @property def _constructor_expanddim(self): @@ -1640,7 +1544,7 @@ def dtype(self): """ `dtype` of the range of values in RangeIndex. 
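The rewritten ``Index._from_data`` above picks the concrete index class from the type of the single column it receives, and hands off to ``MultiIndex._from_data`` when more than one column is present. Condensed from the code, the dispatch is:

    values = next(iter(data.values()))
    # NumericalColumn    -> _dtype_to_index[values.dtype.type], falling back to GenericIndex
    # DatetimeColumn     -> DatetimeIndex
    # TimeDeltaColumn    -> TimedeltaIndex
    # StringColumn       -> StringIndex
    # CategoricalColumn  -> CategoricalIndex
    # len(data) > 1      -> cudf.MultiIndex._from_data(data)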
""" - return np.dtype(np.int64) + return cudf.dtype(np.int64) @property def is_contiguous(self): @@ -1783,25 +1687,25 @@ def __mul__(self, other): class GenericIndex(BaseIndex): - """An array of orderable values that represent the indices of another Column + """ + An array of orderable values that represent the indices of another Column Attributes ---------- _values: A Column object name: A string + + Parameters + ---------- + data : Column + The Column of data for this index + name : str optional + The name of the Index. If not provided, the Index adopts the value + Column's name. Otherwise if this name is different from the value + Column's, the data Column will be cloned to adopt this name. """ def __init__(self, data, **kwargs): - """ - Parameters - ---------- - data : Column - The Column of data for this index - name : str optional - The name of the Index. If not provided, the Index adopts the value - Column's name. Otherwise if this name is different from the value - Column's, the data Column will be cloned to adopt this name. - """ kwargs = _setdefault_name(data, **kwargs) # normalize the input @@ -2007,42 +1911,252 @@ def __init__(self, data=None, dtype=None, copy=False, name=None): class Int8Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Int8Index is a special case of Index with purely + integer(``int8``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Int8Index + """ + _dtype = np.int8 class Int16Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Int16Index is a special case of Index with purely + integer(``int16``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Int16Index + """ + _dtype = np.int16 class Int32Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Int32Index is a special case of Index with purely + integer(``int32``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Int32Index + """ + _dtype = np.int32 class Int64Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Int64Index is a special case of Index with purely + integer(``int64``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Int64Index + """ + _dtype = np.int64 class UInt8Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + UInt8Index is a special case of Index with purely + integer(``uint64``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. 
+ name : object + Name to be stored in the index. + + Returns + ------- + UInt8Index + """ + _dtype = np.uint8 class UInt16Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + UInt16Index is a special case of Index with purely + integer(``uint16``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + UInt16Index + """ + _dtype = np.uint16 class UInt32Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + UInt32Index is a special case of Index with purely + integer(``uint32``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + UInt32Index + """ + _dtype = np.uint32 class UInt64Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + UInt64Index is a special case of Index with purely + integer(``uint64``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + UInt64Index + """ + _dtype = np.uint64 class Float32Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Float32Index is a special case of Index with purely + float(``float32``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Float32Index + """ + _dtype = np.float32 class Float64Index(NumericIndex): + """ + Immutable, ordered and sliceable sequence of labels. + The basic object storing row labels for all cuDF objects. + Float64Index is a special case of Index with purely + float(``float64``) labels. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype, + but not used. + copy : bool + Make a copy of input data. + name : object + Name to be stored in the index. + + Returns + ------- + Float64Index + """ + _dtype = np.float64 @@ -2357,6 +2471,31 @@ def is_leap_year(self): res = is_leap_year(self._values).fillna(False) return cupy.asarray(res) + @property + def quarter(self): + """ + Integer indicator for which quarter of the year the date belongs in. + + There are 4 quarters in a year. With the first quarter being from + January - March, second quarter being April - June, third quarter + being July - September and fourth quarter being October - December. + + Returns + ------- + Int8Index + Integer indicating which quarter the date belongs to. + + Examples + -------- + >>> import cudf + >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", + ... 
"1999-12-31 18:40:00"]) + >>> gIndex.quarter + Int8Index([2, 4], dtype='int8') + """ + res = extract_quarter(self._values) + return Int8Index(res, dtype="int8") + def to_pandas(self): nanos = self._values.astype("datetime64[ns]") return pd.DatetimeIndex(nanos.to_pandas(), name=self.name) @@ -2493,6 +2632,13 @@ def components(self): @property def inferred_freq(self): + """ + Infers frequency of TimedeltaIndex. + + Notes + ----- + This property is currently not supported. + """ raise NotImplementedError("inferred_freq is not yet supported") @@ -2798,7 +2944,7 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): Construct an IntervalIndex from an array of splits. Parameters - --------- + ---------- breaks : array-like (1-dimensional) Left and right bounds for each interval. closed : {"left", "right", "both", "neither"}, default "right" @@ -2878,7 +3024,7 @@ def __repr__(self): + ")" ) - @copy_docstring(StringMethods.__init__) # type: ignore + @copy_docstring(StringMethods) # type: ignore @property def str(self): return StringMethods(parent=self) @@ -3043,3 +3189,43 @@ def __new__( ) return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs) + + +def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: + """ + An internal Utility function to concat RangeIndex objects. + """ + start = step = next_ = None + + # Filter the empty indexes + non_empty_indexes = [obj for obj in indexes if len(obj)] + + if not non_empty_indexes: + # Here all "indexes" had 0 length, i.e. were empty. + # In this case return an empty range index. + return RangeIndex(0, 0) + + for obj in non_empty_indexes: + if start is None: + # This is set by the first non-empty index + start = obj.start + if step is None and len(obj) > 1: + step = obj.step + elif step is None: + # First non-empty index had only one element + if obj.start == start: + result = as_index(concat_columns([x._values for x in indexes])) + return result + step = obj.start - start + + non_consecutive = (step != obj.step and len(obj) > 1) or ( + next_ is not None and obj.start != next_ + ) + if non_consecutive: + result = as_index(concat_columns([x._values for x in indexes])) + return result + if step is not None: + next_ = obj[-1] + step + + stop = non_empty_indexes[-1].stop if next_ is None else next_ + return RangeIndex(start, stop, step) diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index a4a69a4e084..da999f13fa8 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -98,8 +98,9 @@ def __getitem__(self, arg): or _is_null_host_scalar(data) ): return data - index = self._sr.index.take(arg) - return self._sr._copy_construct(data=data, index=index) + return self._sr._from_data( + {self._sr.name: data}, index=cudf.Index(self._sr.index.take(arg)) + ) def __setitem__(self, key, value): from cudf.core.column import column @@ -431,7 +432,7 @@ def _setitem_tuple_arg(self, key, value): ) try: - columns = self._get_column_selection(key[1]) + columns_df = self._get_column_selection(key[1]) except KeyError: if not self._df.empty and isinstance(key[0], slice): pos_range = get_label_range_or_mask( @@ -456,8 +457,27 @@ def _setitem_tuple_arg(self, key, value): ) self._df._data.insert(key[1], new_col) else: - for col in columns: - self._df[col].loc[key[0]] = value + if isinstance(value, (cp.ndarray, np.ndarray)): + value_df = cudf.DataFrame(value) + if value_df.shape[1] != columns_df.shape[1]: + if value_df.shape[1] == 1: + value_cols = ( + value_df._data.columns * 
columns_df.shape[1] + ) + else: + raise ValueError( + f"shape mismatch: value array of shape " + f"{value_df.shape} could not be " + f"broadcast to indexing result of shape " + f"{columns_df.shape}" + ) + else: + value_cols = value_df._data.columns + for i, col in enumerate(columns_df._column_names): + self._df[col].loc[key[0]] = value_cols[i] + else: + for col in columns_df._column_names: + self._df[col].loc[key[0]] = value def _get_column_selection(self, arg): return self._df._get_columns_by_label(arg) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 51423d604c2..079a6d902b6 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -6,7 +6,7 @@ import pickle import warnings from collections.abc import Sequence -from typing import Any, List, Tuple, Union +from typing import Any, List, Mapping, Tuple, Union import cupy import numpy as np @@ -18,7 +18,6 @@ from cudf._typing import DataFrameOrSeries from cudf.core._compat import PANDAS_GE_120 from cudf.core.column import as_column, column -from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import SingleColumnFrame from cudf.core.index import BaseIndex, as_index from cudf.utils.utils import _maybe_indices_to_slice @@ -94,7 +93,6 @@ def __init__( self._name = None - column_names = [] if labels: warnings.warn( "the 'labels' keyword is deprecated, use 'codes' " "instead", @@ -124,17 +122,6 @@ def __init__( self._levels = levels return - # name setup - if isinstance(names, (Sequence, pd.core.indexes.frozen.FrozenList,),): - if sum(x is None for x in names) > 1: - column_names = list(range(len(codes))) - else: - column_names = names - elif names is None: - column_names = list(range(len(codes))) - else: - column_names = names - if len(levels) == 0: raise ValueError("Must pass non-zero number of levels/codes") @@ -147,10 +134,12 @@ def __init__( self._codes = codes elif len(levels) == len(codes): self._codes = cudf.DataFrame() - for i, codes in enumerate(codes): - name = column_names[i] or i - codes = column.as_column(codes) - self._codes[name] = codes.astype(np.int64) + self._codes = cudf.DataFrame._from_data( + { + i: column.as_column(code).astype(np.int64) + for i, code in enumerate(codes) + } + ) else: raise ValueError( "MultiIndex has unequal number of levels and " @@ -161,20 +150,20 @@ def __init__( self._validate_levels_and_codes(self._levels, self._codes) source_data = cudf.DataFrame() - for i, name in enumerate(self._codes.columns): - codes = as_index(self._codes[name]._column) - if -1 in self._codes[name].values: + for i, n in enumerate(self._codes.columns): + codes = as_index(self._codes[n]._column) + if -1 in self._codes[n].values: # Must account for null(s) in _source_data column level = cudf.DataFrame( - {name: [None] + list(self._levels[i])}, + {n: [None] + list(self._levels[i])}, index=range(-1, len(self._levels[i])), ) else: - level = cudf.DataFrame({name: self._levels[i]}) + level = cudf.DataFrame({n: self._levels[i]}) - source_data[name] = libcudf.copying.gather( + source_data[n] = libcudf.copying.gather( level, codes._data.columns[0] - )._data[name] + )[0][n] self._data = source_data._data self.names = names @@ -294,17 +283,15 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) + # TODO: This type ignore is indicating a real problem, which is that + # MultiIndex should not be inheriting from SingleColumnFrame, but fixing + # that will have to wait until we reshuffle the 
Index hierarchy. @classmethod - def _from_data(cls, data: ColumnAccessor, index=None) -> MultiIndex: + def _from_data( # type: ignore + cls, data: Mapping, index=None + ) -> MultiIndex: return cls.from_frame(cudf.DataFrame._from_data(data)) - @classmethod - def _from_table(cls, table, names=None): - df = cudf.DataFrame(table._data) - if names is None: - names = df.columns - return MultiIndex.from_frame(df, names=names) - @property def shape(self): return (self._data.nrows, len(self._data.names)) @@ -612,6 +599,30 @@ def to_arrow(self): @property def codes(self): + """ + Returns the codes of the underlying MultiIndex. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 11, 12]}) + >>> cudf.MultiIndex.from_frame(df) + MultiIndex([(1, 10), + (2, 11), + (3, 12)], + names=['a', 'b']) + >>> midx = cudf.MultiIndex.from_frame(df) + >>> midx + MultiIndex([(1, 10), + (2, 11), + (3, 12)], + names=['a', 'b']) + >>> midx.codes + a b + 0 0 0 + 1 1 1 + 2 2 2 + """ if self._codes is None: self._compute_levels_and_codes() return self._codes @@ -625,6 +636,37 @@ def nlevels(self): @property def levels(self): + """ + Returns list of levels in the MultiIndex + + Returns + ------- + List of Series objects + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 11, 12]}) + >>> cudf.MultiIndex.from_frame(df) + MultiIndex([(1, 10), + (2, 11), + (3, 12)], + names=['a', 'b']) + >>> midx = cudf.MultiIndex.from_frame(df) + >>> midx + MultiIndex([(1, 10), + (2, 11), + (3, 12)], + names=['a', 'b']) + >>> midx.levels + [0 1 + 1 2 + 2 3 + dtype: int64, 0 10 + 1 11 + 2 12 + dtype: int64] + """ if self._levels is None: self._compute_levels_and_codes() return self._levels @@ -778,8 +820,7 @@ def _compute_levels_and_codes(self): for name in self._source_data.columns: code, cats = self._source_data[name].factorize() codes[name] = code.astype(np.int64) - cats.name = None - cats = cudf.Series(cats)._copy_construct(name=None) + cats = cudf.Series(cats, name=None) levels.append(cats) self._levels = levels @@ -1055,10 +1096,12 @@ def __getitem__(self, index): match = self.take(index) if isinstance(index, slice): return match - result = [] - for level, item in enumerate(match.codes): - result.append(match.levels[level][match.codes[item].iloc[0]]) - return tuple(result) + if isinstance(index, int): + # we are indexing into a single row of the MultiIndex, + # return that row as a tuple: + return match.to_pandas()[0] + else: + return match def to_frame(self, index=True, name=None): df = self._source_data @@ -1126,6 +1169,37 @@ def _concat(cls, objs): @classmethod def from_tuples(cls, tuples, names=None): + """ + Convert list of tuples to MultiIndex. + + Parameters + ---------- + tuples : list / sequence of tuple-likes + Each tuple is the index of one row/column. + names : list / sequence of str, optional + Names for the levels in the index. + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> tuples = [(1, 'red'), (1, 'blue'), + ... 
(2, 'red'), (2, 'blue')] + >>> cudf.MultiIndex.from_tuples(tuples, names=('number', 'color')) + MultiIndex([(1, 'red'), + (1, 'blue'), + (2, 'red'), + (2, 'blue')], + names=['number', 'color']) + """ # Use Pandas for handling Python host objects pdi = pd.MultiIndex.from_tuples(tuples, names=names) result = cls.from_pandas(pdi) @@ -1190,11 +1264,97 @@ def values(self): return self._source_data.values @classmethod - def from_frame(cls, dataframe, names=None): - return cls(source_data=dataframe, names=names) + def from_frame(cls, df, names=None): + """ + Make a MultiIndex from a DataFrame. + + Parameters + ---------- + df : DataFrame + DataFrame to be converted to MultiIndex. + names : list-like, optional + If no names are provided, use the column names, or tuple of column + names if the columns is a MultiIndex. If a sequence, overwrite + names with the given sequence. + + Returns + ------- + MultiIndex + The MultiIndex representation of the given DataFrame. + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], + ... ['NJ', 'Temp'], ['NJ', 'Precip']], + ... columns=['a', 'b']) + >>> df + a b + 0 HI Temp + 1 HI Precip + 2 NJ Temp + 3 NJ Precip + >>> cudf.MultiIndex.from_frame(df) + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], + names=['a', 'b']) + + Using explicit names, instead of the column names + + >>> cudf.MultiIndex.from_frame(df, names=['state', 'observation']) + MultiIndex([('HI', 'Temp'), + ('HI', 'Precip'), + ('NJ', 'Temp'), + ('NJ', 'Precip')], + names=['state', 'observation']) + """ + return cls(source_data=df, names=names) @classmethod def from_product(cls, arrays, names=None): + """ + Make a MultiIndex from the cartesian product of multiple iterables. + + Parameters + ---------- + iterables : list / sequence of iterables + Each iterable has unique labels for each level of the index. + names : list / sequence of str, optional + Names for the levels in the index. + If not explicitly provided, names will be inferred from the + elements of iterables if an element has a name attribute + + Returns + ------- + MultiIndex + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_frame : Make a MultiIndex from a DataFrame. + + Examples + -------- + >>> numbers = [0, 1, 2] + >>> colors = ['green', 'purple'] + >>> cudf.MultiIndex.from_product([numbers, colors], + ... 
names=['number', 'color']) + MultiIndex([(0, 'green'), + (0, 'purple'), + (1, 'green'), + (1, 'purple'), + (2, 'green'), + (2, 'purple')], + names=['number', 'color']) + """ # Use Pandas for handling Python host objects pdi = pd.MultiIndex.from_product(arrays, names=names) result = cls.from_pandas(pdi) @@ -1241,9 +1401,7 @@ def _poplevels(self, level): popped_data[n] = self._data.pop(n) # construct the popped result - popped = cudf.core.index.Index._from_table( - cudf.core.frame.Frame(popped_data) - ) + popped = cudf.Index._from_data(popped_data) popped.names = popped_names # update self diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 9d449d16401..1b8405af1a4 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -781,8 +781,8 @@ def merge_sorted( if by_index and ignore_index: raise ValueError("`by_index` and `ignore_index` cannot both be True") - result = objs[0].__class__._from_table( - cudf._lib.merge.merge_sorted( + result = objs[0].__class__._from_data( + *cudf._lib.merge.merge_sorted( objs, keys=keys, by_index=by_index, @@ -803,9 +803,9 @@ def _pivot(df, index, columns): Parameters ---------- df : DataFrame - index : cudf.core.index.Index + index : cudf.Index Index labels of the result - columns : cudf.core.index.Index + columns : cudf.Index Column labels of the result """ columns_labels, columns_idx = columns._encode() @@ -822,22 +822,31 @@ def as_tuple(x): for v in df: names = [as_tuple(v) + as_tuple(name) for name in column_labels] - col = df._data[v] - result.update( - cudf.DataFrame._from_table( - col.scatter_to_table( - index_idx, - columns_idx, - names, - nrows=len(index_labels), - ncols=len(names), - ) - )._data - ) - out = cudf.DataFrame._from_data( + nrows = len(index_labels) + ncols = len(names) + num_elements = nrows * ncols + if num_elements > 0: + col = df._data[v] + scatter_map = (columns_idx * np.int32(nrows)) + index_idx + target = cudf.core.frame.Frame( + { + None: cudf.core.column.column_empty_like( + col, masked=True, newsize=nrows * ncols + ) + } + ) + target._data[None][scatter_map] = col + result_frames = target._split(range(nrows, nrows * ncols, nrows)) + result.update( + { + name: next(iter(f._columns)) + for name, f in zip(names, result_frames) + } + ) + + return cudf.DataFrame._from_data( result, index=cudf.Index(index_labels, name=index.name) ) - return out def pivot(data, index=None, columns=None, values=None): diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index c6663a25684..f425b650ee7 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -5,7 +5,7 @@ import pyarrow as pa from pandas._libs.missing import NAType as pd_NAType -from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar +import cudf from cudf.core.column.column import ColumnBase from cudf.core.dtypes import Decimal64Dtype, ListDtype, StructDtype from cudf.core.index import BaseIndex @@ -17,45 +17,46 @@ class Scalar(object): + """ + A GPU-backed scalar object with NumPy scalar like properties + May be used in binary operations against other scalars, cuDF + Series, DataFrame, and Index objects. 
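The rewritten _pivot above replaces the per-column scatter_to_table call with a manually computed flat scatter map. As a minimal host-side sketch (NumPy only, with made-up toy values), the arithmetic places the value destined for output row i and output column j at flat position j * nrows + i, and splitting the flat buffer every nrows elements recovers one output column per label:

    import numpy as np

    nrows, ncols = 3, 2                          # toy pivot shape
    index_idx = np.array([0, 1, 2, 0, 1, 2])     # output row of each input value
    columns_idx = np.array([0, 0, 0, 1, 1, 1])   # output column of each input value
    values = np.array([10, 11, 12, 20, 21, 22])

    scatter_map = columns_idx * np.int32(nrows) + index_idx
    flat = np.full(nrows * ncols, -1)            # -1 stands in for a null slot
    flat[scatter_map] = values

    # one array of length nrows per output column label
    out_columns = np.split(flat, list(range(nrows, nrows * ncols, nrows)))
    print(out_columns)                           # [array([10, 11, 12]), array([20, 21, 22])]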
+ + Examples + -------- + >>> import cudf + >>> cudf.Scalar(42, dtype='int64') + Scalar(42, dtype=int64) + >>> cudf.Scalar(42, dtype='int32') + cudf.Scalar(42, dtype='float64') + Scalar(84.0, dtype=float64) + >>> cudf.Scalar(42, dtype='int64') + np.int8(21) + Scalar(63, dtype=int64) + >>> x = cudf.Scalar(42, dtype='datetime64[s]') + >>> y = cudf.Scalar(21, dtype='timedelta64[ns]) + >>> x - y + Scalar(1970-01-01T00:00:41.999999979, dtype=datetime64[ns]) + >>> cudf.Series([1,2,3]) + cudf.Scalar(1) + 0 2 + 1 3 + 2 4 + dtype: int64 + >>> df = cudf.DataFrame({'a':[1,2,3], 'b':[4.5, 5.5, 6.5]}) + >>> slr = cudf.Scalar(10, dtype='uint8') + >>> df - slr + a b + 0 -9 -5.5 + 1 -8 -4.5 + 2 -7 -3.5 + + Parameters + ---------- + value : Python Scalar, NumPy Scalar, or cuDF Scalar + The scalar value to be converted to a GPU backed scalar object + dtype : np.dtype or string specifier + The data type + """ + def __init__(self, value, dtype=None): - """ - A GPU-backed scalar object with NumPy scalar like properties - May be used in binary operations against other scalars, cuDF - Series, DataFrame, and Index objects. - - Examples - -------- - >>> import cudf - >>> cudf.Scalar(42, dtype='int64') - Scalar(42, dtype=int64) - >>> cudf.Scalar(42, dtype='int32') + cudf.Scalar(42, dtype='float64') - Scalar(84.0, dtype=float64) - >>> cudf.Scalar(42, dtype='int64') + np.int8(21) - Scalar(63, dtype=int64) - >>> x = cudf.Scalar(42, dtype='datetime64[s]') - >>> y = cudf.Scalar(21, dtype='timedelta64[ns]) - >>> x - y - Scalar(1970-01-01T00:00:41.999999979, dtype=datetime64[ns]) - >>> cudf.Series([1,2,3]) + cudf.Scalar(1) - 0 2 - 1 3 - 2 4 - dtype: int64 - >>> df = cudf.DataFrame({'a':[1,2,3], 'b':[4.5, 5.5, 6.5]}) - >>> slr = cudf.Scalar(10, dtype='uint8') - >>> df - slr - a b - 0 -9 -5.5 - 1 -8 -4.5 - 2 -7 -3.5 - - Parameters - ---------- - value : Python Scalar, NumPy Scalar, or cuDF Scalar - The scalar value to be converted to a GPU backed scalar object - dtype : np.dtype or string specifier - The data type - """ self._host_value = None self._host_dtype = None @@ -67,7 +68,7 @@ def __init__(self, value, dtype=None): self._host_dtype = value._host_dtype else: self._device_value = value._device_value - elif isinstance(value, DeviceScalar): + elif isinstance(value, cudf._lib.scalar.DeviceScalar): self._device_value = value else: self._host_value, self._host_dtype = self._preprocess_host_value( @@ -85,7 +86,7 @@ def _is_device_value_current(self): @property def device_value(self): if self._device_value is None: - self._device_value = DeviceScalar( + self._device_value = cudf._lib.scalar.DeviceScalar( self._host_value, self._host_dtype ) return self._device_value @@ -101,7 +102,7 @@ def value(self): def dtype(self): if self._is_host_value_current: if isinstance(self._host_value, str): - return np.dtype("object") + return cudf.dtype("object") else: return self._host_dtype else: @@ -110,13 +111,13 @@ def dtype(self): def is_valid(self): if not self._is_host_value_current: self._device_value_to_host() - return not _is_null_host_scalar(self._host_value) + return not cudf._lib.scalar._is_null_host_scalar(self._host_value) def _device_value_to_host(self): self._host_value = self._device_value._to_host_scalar() def _preprocess_host_value(self, value, dtype): - valid = not _is_null_host_scalar(value) + valid = not cudf._lib.scalar._is_null_host_scalar(value) if isinstance(value, list): if dtype is not None: @@ -171,7 +172,7 @@ def _preprocess_host_value(self, value, dtype): dtype = value.dtype if not isinstance(dtype, 
Decimal64Dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if not valid: value = NA @@ -186,7 +187,7 @@ def _sync(self): if self._is_host_value_current and self._is_device_value_current: return elif self._is_host_value_current and not self._is_device_value_current: - self._device_value = DeviceScalar( + self._device_value = cudf._lib.scalar.DeviceScalar( self._host_value, self._host_dtype ) elif self._is_device_value_current and not self._is_host_value_current: @@ -323,10 +324,10 @@ def _binop_result_dtype_or_error(self, other, op): and self.dtype.char == other.dtype.char == "M" ): res, _ = np.datetime_data(max(self.dtype, other.dtype)) - return np.dtype("m8" + f"[{res}]") + return cudf.dtype("m8" + f"[{res}]") return np.result_type(self.dtype, other.dtype) - return np.dtype(out_dtype) + return cudf.dtype(out_dtype) def _scalar_binop(self, other, op): if isinstance(other, (ColumnBase, Series, BaseIndex, np.ndarray)): @@ -357,9 +358,9 @@ def _unaop_result_type_or_error(self, op): if op in {"__ceil__", "__floor__"}: if self.dtype.char in "bBhHf?": - return np.dtype("float32") + return cudf.dtype("float32") else: - return np.dtype("float64") + return cudf.dtype("float64") return self.dtype def _scalar_unaop(self, op): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index fb197fbc90d..ff3b9fc68ef 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,7 +7,7 @@ from collections import abc as abc from numbers import Number from shutil import get_terminal_size -from typing import Any, Optional +from typing import Any, MutableMapping, Optional from uuid import uuid4 import cupy @@ -64,6 +64,48 @@ class Series(SingleColumnFrame, Serializable): + """ + One-dimensional GPU array (including time series). + + Labels need not be unique but must be a hashable type. The object + supports both integer- and label-based indexing and provides a + host of methods for performing operations involving the index. + Statistical methods from ndarray have been overridden to + automatically exclude missing data (currently represented + as null/NaN). + + Operations between Series (`+`, `-`, `/`, `*`, `**`) align + values based on their associated index values-– they need + not be the same length. The result index will be the + sorted union of the two indexes. + + ``Series`` objects are used as columns of ``DataFrame``. + + Parameters + ---------- + data : array-like, Iterable, dict, or scalar value + Contains data stored in Series. + + index : array-like or Index (1d) + Values must be hashable and have the same length + as data. Non-unique index values are allowed. Will + default to RangeIndex (0, 1, 2, …, n) if not provided. + If both a dict and index sequence are used, the index will + override the keys found in the dict. + + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Series. If not specified, + this will be inferred from data. + + name : str, optional + The name to give to the Series. + + nan_as_null : bool, Default True + If ``None``/``True``, converts ``np.nan`` values to + ``null`` values. + If ``False``, leaves ``np.nan`` values as is. + """ + # The `constructor*` properties are used by `dask` (and `dask_cudf`) @property def _constructor(self): @@ -171,47 +213,6 @@ def from_masked_array(cls, data, mask, null_count=None): def __init__( self, data=None, index=None, dtype=None, name=None, nan_as_null=True, ): - """ - One-dimensional GPU array (including time series). 
- - Labels need not be unique but must be a hashable type. The object - supports both integer- and label-based indexing and provides a - host of methods for performing operations involving the index. - Statistical methods from ndarray have been overridden to - automatically exclude missing data (currently represented - as null/NaN). - - Operations between Series (`+`, `-`, `/`, `*`, `**`) align - values based on their associated index values-– they need - not be the same length. The result index will be the - sorted union of the two indexes. - - ``Series`` objects are used as columns of ``DataFrame``. - - Parameters - ---------- - data : array-like, Iterable, dict, or scalar value - Contains data stored in Series. - - index : array-like or Index (1d) - Values must be hashable and have the same length - as data. Non-unique index values are allowed. Will - default to RangeIndex (0, 1, 2, …, n) if not provided. - If both a dict and index sequence are used, the index will - override the keys found in the dict. - - dtype : str, numpy.dtype, or ExtensionDtype, optional - Data type for the output Series. If not specified, - this will be inferred from data. - - name : str, optional - The name to give to the Series. - - nan_as_null : bool, Default True - If ``None``/``True``, converts ``np.nan`` values to - ``null`` values. - If ``False``, leaves ``np.nan`` values as is. - """ if isinstance(data, pd.Series): if name is None: name = data.name @@ -266,29 +267,19 @@ def __init__( super().__init__({name: data}) self._index = RangeIndex(len(data)) if index is None else index - @classmethod - def _from_table(cls, table, index=None): - name, data = next(iter(table._data.items())) - if index is None: - if table._index is not None: - index = Index._from_table(table._index) - return cls(data=data, index=index, name=name) - @classmethod def _from_data( cls, - data: ColumnAccessor, - index: Optional[Index] = None, + data: MutableMapping, + index: Optional[BaseIndex] = None, name: Any = None, ) -> Series: """ Construct the Series from a ColumnAccessor """ - out = cls.__new__(cls) - out._data = data - out._index = index if index is not None else RangeIndex(data.nrows) - if name is not None: - out.name = name + out: Series = super()._from_data(data, index, name) + if index is None: + out._index = RangeIndex(out._data.nrows) return out def __contains__(self, item): @@ -392,10 +383,6 @@ def deserialize(cls, header, frames): return Series(column, index=index, name=name) - @property - def _copy_construct_defaults(self): - return {"data": self._column, "index": self._index, "name": self.name} - def _get_columns_by_label(self, labels, downcast=False): """Return the column specified by `labels` @@ -467,7 +454,7 @@ def drop( Return series without null values Series.drop_duplicates Return series with duplicate values removed - cudf.core.dataframe.DataFrame.drop + cudf.DataFrame.drop Drop specified labels from rows or columns in dataframe Examples @@ -708,7 +695,7 @@ def reset_index(self, drop=False, inplace=False): if inplace is True: self._index = RangeIndex(len(self)) else: - return self._copy_construct(index=RangeIndex(len(self))) + return self._from_data(self._data, index=RangeIndex(len(self))) def set_index(self, index): """Returns a new Series with a different index. 
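Several hunks in this file swap the removed _copy_construct helper for the _from_data classmethod whose signature appears above (a mapping of column name to column, an optional index, an optional name). A purely illustrative sketch of that construction path, using internal APIs that are not part of the public interface:

    import cudf
    from cudf.core.column import as_column

    # build a Series directly from {name: column}; with index omitted a
    # RangeIndex of matching length is attached, per the code above
    sr = cudf.Series._from_data({"a": as_column([1, 2, 3])})
    print(sr)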
@@ -743,7 +730,7 @@ def set_index(self, index): dtype: int64 """ index = index if isinstance(index, BaseIndex) else as_index(index) - return self._copy_construct(index=index) + return self._from_data(self._data, index, self.name) def as_index(self): """Returns a new Series with a RangeIndex. @@ -855,8 +842,14 @@ def set_mask(self, mask, null_count=None): 4 5 dtype: int64 """ - col = self._column.set_mask(mask) - return self._copy_construct(data=col) + warnings.warn( + "Series.set_mask is deprecated and will be removed " + "in the future.", + DeprecationWarning, + ) + return self._from_data( + {self.name: self._column.set_mask(mask)}, self._index + ) def __sizeof__(self): return self._column.__sizeof__() + self._index.__sizeof__() @@ -884,7 +877,7 @@ def memory_usage(self, index=True, deep=False): See Also -------- - cudf.core.dataframe.DataFrame.memory_usage : Bytes consumed by + cudf.DataFrame.memory_usage : Bytes consumed by a DataFrame. Examples @@ -1097,8 +1090,9 @@ def take(self, indices, keep_index=True): return self.iloc[indices] else: col_inds = as_column(indices) - data = self._column.take(col_inds, keep_index=False) - return self._copy_construct(data=data, index=None) + return self._from_data( + {self.name: self._column.take(col_inds, keep_index=False)} + ) def head(self, n=5): """ @@ -2349,22 +2343,22 @@ def __invert__(self): f"Operation `~` not supported on {self.dtype.type.__name__}" ) - @copy_docstring(CategoricalAccessor.__init__) # type: ignore + @copy_docstring(CategoricalAccessor) # type: ignore @property def cat(self): return CategoricalAccessor(parent=self) - @copy_docstring(StringMethods.__init__) # type: ignore + @copy_docstring(StringMethods) # type: ignore @property def str(self): return StringMethods(parent=self) - @copy_docstring(ListMethods.__init__) # type: ignore + @copy_docstring(ListMethods) # type: ignore @property def list(self): return ListMethods(parent=self) - @copy_docstring(StructMethods.__init__) # type: ignore + @copy_docstring(StructMethods) # type: ignore @property def struct(self): return StructMethods(parent=self) @@ -2508,10 +2502,10 @@ def dropna(self, axis=0, inplace=False, how=None): Series.fillna : Replace null values. - cudf.core.dataframe.DataFrame.dropna : Drop rows or columns which + cudf.DataFrame.dropna : Drop rows or columns which contain null values. - cudf.core.index.Index.dropna : Drop null indices. + cudf.Index.dropna : Drop null indices. Examples -------- @@ -2727,113 +2721,23 @@ def nans_to_nulls(self): 4 10.0 dtype: float64 """ - result_col = self._column.nans_to_nulls() - return self._copy_construct(data=result_col) + return self._from_data( + {self.name: self._column.nans_to_nulls()}, self._index + ) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether all elements are True in Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be True, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. 
- - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.all() - True - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - if bool_only not in (None, True): raise NotImplementedError( - "bool_only parameter is not implemented yet" + "The bool_only parameter is not supported for Series." ) - - if skipna: - result_series = self.nans_to_nulls() - if len(result_series) == result_series.null_count: - return True - else: - result_series = self - return result_series._column.all() + return super().all(axis, skipna, level, **kwargs) def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether any elements is True in Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be False, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.any() - True - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - if bool_only not in (None, True): raise NotImplementedError( - "bool_only parameter is not implemented yet" + "The bool_only parameter is not supported for Series." ) - - skipna = False if skipna is None else skipna - - if skipna is False and self.has_nulls: - return True - - if skipna: - result_series = self.nans_to_nulls() - if len(result_series) == result_series.null_count: - return False - - else: - result_series = self - - return result_series._column.any() + return super().any(axis, skipna, level, **kwargs) def to_pandas(self, index=True, nullable=False, **kwargs): """ @@ -2941,7 +2845,7 @@ def loc(self): See also -------- - cudf.core.dataframe.DataFrame.loc + cudf.DataFrame.loc Examples -------- @@ -2964,7 +2868,7 @@ def iloc(self): See also -------- - cudf.core.dataframe.DataFrame.iloc + cudf.DataFrame.iloc Examples -------- @@ -3106,8 +3010,9 @@ def astype(self, dtype, copy=False, errors="raise"): try: data = self._column.astype(dtype) - return self._copy_construct( - data=data.copy(deep=True) if copy else data, index=self.index + return self._from_data( + {self.name: (data.copy(deep=True) if copy else data)}, + index=self._index, ) except Exception as e: @@ -3421,8 +3326,8 @@ def _sort(self, ascending=True, na_position="last"): col_keys, col_inds = self._column.sort_by_values( ascending=ascending, na_position=na_position ) - sr_keys = self._copy_construct(data=col_keys) - sr_inds = self._copy_construct(data=col_inds) + sr_keys = self._from_data({self.name: col_keys}, self._index) + sr_inds = self._from_data({self.name: col_inds}, self._index) return sr_keys, sr_inds def replace( @@ -3725,9 +3630,9 @@ def reverse(self): dtype: int64 """ rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32) - col = self._column[rinds] - index = self.index._values[rinds] - return self._copy_construct(data=col, index=index) + return self._from_data( + {self.name: self._column[rinds]}, self.index._values[rinds] + ) def one_hot_encoding(self, cats, 
dtype="float64"): """Perform one-hot-encoding @@ -3774,7 +3679,7 @@ def one_hot_encoding(self, cats, dtype="float64"): cats = cats.to_pandas() else: cats = pd.Series(cats, dtype="object") - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) def encode(cat): if cat is None: @@ -3881,7 +3786,9 @@ def _return_sentinel_series(): codes = codes.merge(value, on="value", how="left") codes = codes.sort_values("order")["code"].fillna(na_sentinel) - return codes._copy_construct(name=None, index=self.index) + codes.name = None + codes.index = self._index + return codes # UDF related @@ -3995,7 +3902,7 @@ def applymap(self, udf, out_dtype=None): """ if not callable(udf): raise ValueError("Input UDF must be a callable object.") - return self._copy_construct(data=self._unaryop(udf)) + return self._from_data({self.name: self._unaryop(udf)}, self._index) # # Stats @@ -4026,932 +3933,155 @@ def count(self, level=None, **kwargs): return self.valid_count - def min( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - **kwargs, - ): + def mode(self, dropna=True): """ - Return the minimum of the values in the Series. + Return the mode(s) of the dataset. + + Always returns Series even if only one value is returned. Parameters ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. + dropna : bool, default True + Don't consider counts of NA/NaN/NaT. Returns ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. + Series + Modes of the Series in sorted order. Examples -------- >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.min() - 1 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") + >>> series = cudf.Series([7, 6, 5, 4, 3, 2, 1]) + >>> series + 0 7 + 1 6 + 2 5 + 3 4 + 4 3 + 5 2 + 6 1 + dtype: int64 + >>> series.mode() + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + 6 7 + dtype: int64 - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") + We can include ```` values in mode by + passing ``dropna=False``. - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) + >>> series = cudf.Series([7, 4, 3, 3, 7, None, None]) + >>> series + 0 7 + 1 4 + 2 3 + 3 3 + 4 7 + 5 + 6 + dtype: int64 + >>> series.mode() + 0 3 + 1 7 + dtype: int64 + >>> series.mode(dropna=False) + 0 3 + 1 7 + 2 + dtype: int64 + """ + val_counts = self.value_counts(ascending=False, dropna=dropna) + if len(val_counts) > 0: + val_counts = val_counts[val_counts == val_counts.iloc[0]] - return self._column.min(skipna=skipna, dtype=dtype) + return Series(val_counts.index.sort_values(), name=self.name) - def max( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - **kwargs, - ): + def round(self, decimals=0, how="half_even"): """ - Return the maximum of the values in the Series. + Round each value in a Series to the given number of decimals. Parameters ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. + decimals : int, default 0 + Number of decimal places to round to. If decimals is negative, + it specifies the number of positions to the left of the decimal + point. + how : str, optional + Type of rounding. Can be either "half_even" (default) + of "half_up" rounding. 
Returns ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. + Series + Rounded values of the Series. Examples -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.max() - 5 + >>> s = cudf.Series([0.1, 1.4, 2.9]) + >>> s.round() + 0 0.0 + 1 1.0 + 2 3.0 + dtype: float64 """ + return Series( + self._column.round(decimals=decimals, how=how), + name=self.name, + index=self.index, + dtype=self.dtype, + ) - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.max(skipna=skipna, dtype=dtype) - - def sum( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): + def cov(self, other, min_periods=None): """ - Return sum of the values in the Series. + Compute covariance with Series, excluding missing values. Parameters ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - min_count : int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. + other : Series + Series with which to compute the covariance. Returns ------- - scalar + float + Covariance between Series and other normalized by N-1 + (unbiased estimator). Notes ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. + `min_periods` parameter is not yet supported. Examples -------- >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.sum() - 15 + >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) + >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) + >>> ser1.cov(ser2) + -0.015750000000000004 """ - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): + if min_periods is not None: raise NotImplementedError( - "numeric_only parameter is not implemented yet" + "min_periods parameter is not implemented yet" ) - return self._column.sum( - skipna=skipna, dtype=dtype, min_count=min_count - ) - - def product( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. + if self.empty or other.empty: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - min_count : int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. + lhs = self.nans_to_nulls().dropna() + rhs = other.nans_to_nulls().dropna() - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. 
+ lhs, rhs = _align_indices([lhs, rhs], how="inner") - Returns - ------- - scalar + return lhs._column.cov(rhs._column) - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.product() - 120 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.product( - skipna=skipna, dtype=dtype, min_count=min_count - ) - - prod = product - - def cummin(self, axis=None, skipna=True, *args, **kwargs): - """ - Return cumulative minimum of the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cummin() - 0 1 - 1 1 - 2 1 - 3 1 - 4 1 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - skipna = True if skipna is None else skipna - - if skipna: - result_col = self.nans_to_nulls()._column - else: - result_col = self._column.copy() - if result_col.has_nulls: - # Workaround as find_first_value doesn't seem to work - # incase of bools. - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - return Series( - result_col._apply_scan_op("min"), name=self.name, index=self.index, - ) - - def cummax(self, axis=0, skipna=True, *args, **kwargs): - """ - Return cumulative maximum of the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cummax() - 0 1 - 1 5 - 2 5 - 3 5 - 4 5 - """ - assert axis in (None, 0) - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - skipna = True if skipna is None else skipna - - if skipna: - result_col = self.nans_to_nulls()._column - else: - result_col = self._column.copy() - if result_col.has_nulls: - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - return Series( - result_col._apply_scan_op("max"), name=self.name, index=self.index, - ) - - def cumsum(self, axis=0, skipna=True, *args, **kwargs): - """ - Return cumulative sum of the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. 
- - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cumsum() - 0 1 - 1 6 - 2 8 - 3 12 - 4 15 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - skipna = True if skipna is None else skipna - - if skipna: - result_col = self.nans_to_nulls()._column - else: - result_col = self._column.copy() - if result_col.has_nulls: - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - # pandas always returns int64 dtype if original dtype is int or `bool` - if not is_decimal_dtype(result_col.dtype) and ( - np.issubdtype(result_col.dtype, np.integer) - or np.issubdtype(result_col.dtype, np.bool_) - ): - return Series( - result_col.astype(np.int64)._apply_scan_op("sum"), - name=self.name, - index=self.index, - ) - else: - return Series( - result_col._apply_scan_op("sum"), - name=self.name, - index=self.index, - ) - - def cumprod(self, axis=0, skipna=True, *args, **kwargs): - """ - Return cumulative product of the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported is `axis` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cumprod() - 0 1 - 1 5 - 2 10 - 3 40 - 4 120 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if is_decimal_dtype(self.dtype): - raise NotImplementedError( - "cumprod does not currently support decimal types" - ) - - skipna = True if skipna is None else skipna - - if skipna: - result_col = self.nans_to_nulls()._column - else: - result_col = self._column.copy() - if result_col.has_nulls: - first_index = int( - result_col.isnull().astype("int8").find_first_value(1) - ) - result_col[first_index:] = None - - # pandas always returns int64 dtype if original dtype is int or `bool` - if np.issubdtype(result_col.dtype, np.integer) or np.issubdtype( - result_col.dtype, np.bool_ - ): - return Series( - result_col.astype(np.int64)._apply_scan_op("product"), - name=self.name, - index=self.index, - ) - else: - return Series( - result_col._apply_scan_op("product"), - name=self.name, - index=self.index, - ) - - def mean( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the mean of the values in the series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser.mean() - 15.5 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.mean(skipna=skipna) - - def std( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return sample standard deviation of the Series. - - Normalized by N-1 by default. 
This can be changed using - the `ddof` argument - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is N - ddof, where N represents the number of elements. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 10, 20, 30, 40]) - >>> series - 0 10 - 1 10 - 2 20 - 3 30 - 4 40 - dtype: int64 - >>> series.std() - 13.038404810405298 - >>> series.std(ddof=2) - 15.05545305418162 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.std(skipna=skipna, ddof=ddof) - - def var( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return unbiased variance of the Series. - - Normalized by N-1 by default. This can be changed using the - ddof argument - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is - N - ddof, where N represents the number of elements. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 0, 1]) - >>> series - 0 10 - 1 11 - 2 12 - 3 0 - 4 1 - dtype: int64 - >>> series.var() - 33.7 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.var(skipna=skipna, ddof=ddof) - - def sum_of_squares(self, dtype=None): - return self._column.sum_of_squares(dtype=dtype) - - def median( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the median of the values for the requested axis. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser - 0 10 - 1 25 - 2 3 - 3 25 - 4 24 - 5 6 - dtype: int64 - >>> ser.median() - 17.0 - """ - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.median(skipna=skipna) - - def mode(self, dropna=True): - """ - Return the mode(s) of the dataset. - - Always returns Series even if only one value is returned. 
- - Parameters - ---------- - dropna : bool, default True - Don't consider counts of NA/NaN/NaT. - - Returns - ------- - Series - Modes of the Series in sorted order. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([7, 6, 5, 4, 3, 2, 1]) - >>> series - 0 7 - 1 6 - 2 5 - 3 4 - 4 3 - 5 2 - 6 1 - dtype: int64 - >>> series.mode() - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - 6 7 - dtype: int64 - - We can include ```` values in mode by - passing ``dropna=False``. - - >>> series = cudf.Series([7, 4, 3, 3, 7, None, None]) - >>> series - 0 7 - 1 4 - 2 3 - 3 3 - 4 7 - 5 - 6 - dtype: int64 - >>> series.mode() - 0 3 - 1 7 - dtype: int64 - >>> series.mode(dropna=False) - 0 3 - 1 7 - 2 - dtype: int64 - """ - val_counts = self.value_counts(ascending=False, dropna=dropna) - if len(val_counts) > 0: - val_counts = val_counts[val_counts == val_counts.iloc[0]] - - return Series(val_counts.index.sort_values(), name=self.name) - - def round(self, decimals=0, how="half_even"): - """ - Round each value in a Series to the given number of decimals. - - Parameters - ---------- - decimals : int, default 0 - Number of decimal places to round to. If decimals is negative, - it specifies the number of positions to the left of the decimal - point. - how : str, optional - Type of rounding. Can be either "half_even" (default) - of "half_up" rounding. - - Returns - ------- - Series - Rounded values of the Series. - - Examples - -------- - >>> s = cudf.Series([0.1, 1.4, 2.9]) - >>> s.round() - 0 0.0 - 1 1.0 - 2 3.0 - dtype: float64 - """ - return Series( - self._column.round(decimals=decimals, how=how), - name=self.name, - index=self.index, - dtype=self.dtype, - ) - - def kurtosis( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return Fisher's unbiased kurtosis of a sample. - - Kurtosis obtained using Fisher’s definition of - kurtosis (kurtosis of normal == 0.0). Normalized by N-1. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4]) - >>> series.kurtosis() - -1.1999999999999904 - """ - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.kurtosis(skipna=skipna) - - # Alias for kurtosis. - kurt = kurtosis - - def skew( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return unbiased Fisher-Pearson skew of a sample. - - Parameters - ---------- - skipna : bool, default True - Exclude NA/null values when computing the result. 
- - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6]) - >>> series - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - 6 6 - dtype: int64 - >>> series.skew() - -0.288195490292614 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.skew(skipna=skipna) - - def cov(self, other, min_periods=None): - """ - Compute covariance with Series, excluding missing values. - - Parameters - ---------- - other : Series - Series with which to compute the covariance. - - Returns - ------- - float - Covariance between Series and other normalized by N-1 - (unbiased estimator). - - Notes - ----- - `min_periods` parameter is not yet supported. - - Examples - -------- - >>> import cudf - >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) - >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) - >>> ser1.cov(ser2) - -0.015750000000000004 - """ - - if min_periods is not None: - raise NotImplementedError( - "min_periods parameter is not implemented yet" - ) - - if self.empty or other.empty: - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - lhs = self.nans_to_nulls().dropna() - rhs = other.nans_to_nulls().dropna() - - lhs, rhs = _align_indices([lhs, rhs], how="inner") - - return lhs._column.cov(rhs._column) - - def corr(self, other, method="pearson", min_periods=None): - """Calculates the sample correlation between two Series, - excluding missing values. + def corr(self, other, method="pearson", min_periods=None): + """Calculates the sample correlation between two Series, + excluding missing values. Examples -------- @@ -4962,7 +4092,11 @@ def corr(self, other, method="pearson", min_periods=None): -0.20454263717316112 """ - assert method in ("pearson",) and min_periods in (None,) + if method not in ("pearson",): + raise ValueError(f"Unknown method {method}") + + if min_periods not in (None,): + raise NotImplementedError("Unsupported argument 'min_periods'") if self.empty or other.empty: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -5149,7 +4283,7 @@ def value_counts( Series.count Number of non-NA elements in a Series. - cudf.core.dataframe.DataFrame.count + cudf.DataFrame.count Number of non-NA elements in a DataFrame. Examples @@ -5260,7 +4394,8 @@ def scale(self): vmin = self.min() vmax = self.max() scaled = (self - vmin) / (vmax - vmin) - return self._copy_construct(data=scaled) + scaled._index = self._index.copy(deep=False) + return scaled # Absolute def abs(self): @@ -5411,7 +4546,8 @@ def hash_encode(self, stop, use_name=False): 2 76 dtype: int32 """ - assert stop > 0 + if not stop > 0: + raise ValueError("stop must be a positive integer.") initial_hash = [hash(self.name) & 0xFFFFFFFF] if use_name else None hashed_values = Series(self._hash(initial_hash)) @@ -5755,7 +4891,7 @@ def diff(self, periods=1): return Series(output_col, name=self.name, index=self.index) - @copy_docstring(SeriesGroupBy.__init__) + @copy_docstring(SeriesGroupBy) def groupby( self, by=None, @@ -6438,6 +5574,42 @@ def is_leap_year(self): ------- Series Booleans indicating if dates belong to a leap year. + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series( + ... 
pd.date_range(start='2000-02-01', end='2013-02-01', freq='1Y')) + >>> s + 0 2000-12-31 + 1 2001-12-31 + 2 2002-12-31 + 3 2003-12-31 + 4 2004-12-31 + 5 2005-12-31 + 6 2006-12-31 + 7 2007-12-31 + 8 2008-12-31 + 9 2009-12-31 + 10 2010-12-31 + 11 2011-12-31 + 12 2012-12-31 + dtype: datetime64[ns] + >>> s.dt.is_leap_year + 0 True + 1 False + 2 False + 3 False + 4 True + 5 False + 6 False + 7 False + 8 True + 9 False + 10 False + 11 False + 12 True + dtype: bool """ res = libcudf.datetime.is_leap_year(self.series._column).fillna(False) return Series._from_data( @@ -6447,17 +5619,304 @@ def is_leap_year(self): ) @property - def is_month_start(self): + def quarter(self): """ - Boolean indicator if the date is the first day of the month. + Integer indicator for which quarter of the year the date belongs in. + + There are 4 quarters in a year. With the first quarter being from + January - March, second quarter being April - June, third quarter + being July - September and fourth quarter being October - December. Returns ------- Series + Integer indicating which quarter the date belongs to. + + Examples + ------- + >>> import cudf + >>> s = cudf.Series(["2020-05-31 08:00:00","1999-12-31 18:40:00"], + ... dtype="datetime64[ms]") + >>> s.dt.quarter + 0 2 + 1 4 + dtype: int8 + """ + res = libcudf.datetime.extract_quarter(self.series._column).astype( + np.int8 + ) + return Series._from_data( + {None: res}, index=self.series._index, name=self.series.name, + ) + + @property + def is_month_start(self): + """ Booleans indicating if dates are the first day of the month. """ return (self.day == 1).fillna(False) + @property + def days_in_month(self): + """ + Get the total number of days in the month that the date falls on. + + Returns + ------- + Series + Integers representing the number of days in month + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series( + ... pd.date_range(start='2000-08-01', end='2001-08-01', freq='1M')) + >>> s + 0 2000-08-31 + 1 2000-09-30 + 2 2000-10-31 + 3 2000-11-30 + 4 2000-12-31 + 5 2001-01-31 + 6 2001-02-28 + 7 2001-03-31 + 8 2001-04-30 + 9 2001-05-31 + 10 2001-06-30 + 11 2001-07-31 + dtype: datetime64[ns] + >>> s.dt.days_in_month + 0 31 + 1 30 + 2 31 + 3 30 + 4 31 + 5 31 + 6 28 + 7 31 + 8 30 + 9 31 + 10 30 + 11 31 + dtype: int16 + """ + res = libcudf.datetime.days_in_month(self.series._column) + return Series._from_data( + ColumnAccessor({None: res}), + index=self.series._index, + name=self.series.name, + ) + + @property + def is_month_end(self): + """ + Boolean indicator if the date is the last day of the month. + + Returns + ------- + Series + Booleans indicating if dates are the last day of the month. + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series( + ... pd.date_range(start='2000-08-26', end='2000-09-03', freq='1D')) + >>> s + 0 2000-08-26 + 1 2000-08-27 + 2 2000-08-28 + 3 2000-08-29 + 4 2000-08-30 + 5 2000-08-31 + 6 2000-09-01 + 7 2000-09-02 + 8 2000-09-03 + dtype: datetime64[ns] + >>> s.dt.is_month_end + 0 False + 1 False + 2 False + 3 False + 4 False + 5 True + 6 False + 7 False + 8 False + dtype: bool + """ # noqa: E501 + last_day = libcudf.datetime.last_day_of_month(self.series._column) + last_day = Series._from_data( + ColumnAccessor({None: last_day}), + index=self.series._index, + name=self.series.name, + ) + return (self.day == last_day.dt.day).fillna(False) + + @property + def is_quarter_start(self): + """ + Boolean indicator if the date is the first day of a quarter. 
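The quarter property above delegates to libcudf.datetime.extract_quarter on device; the mapping it implements is the one spelled out in its docstring (Jan-Mar -> 1, Apr-Jun -> 2, Jul-Sep -> 3, Oct-Dec -> 4). A host-side sketch of that mapping with toy month numbers:

    import numpy as np

    month = np.array([5, 12])        # May, December
    quarter = (month - 1) // 3 + 1
    print(quarter)                   # [2 4], matching the docstring example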
+ + Returns + ------- + Series + Booleans indicating if dates are the begining of a quarter + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series( + ... pd.date_range(start='2000-09-26', end='2000-10-03', freq='1D')) + >>> s + 0 2000-09-26 + 1 2000-09-27 + 2 2000-09-28 + 3 2000-09-29 + 4 2000-09-30 + 5 2000-10-01 + 6 2000-10-02 + 7 2000-10-03 + dtype: datetime64[ns] + >>> s.dt.is_quarter_start + 0 False + 1 False + 2 False + 3 False + 4 False + 5 True + 6 False + 7 False + dtype: bool + """ + day = self.series._column.get_dt_field("day") + first_month = self.series._column.get_dt_field("month").isin( + [1, 4, 7, 10] + ) + + result = ((day == cudf.Scalar(1)) & first_month).fillna(False) + return Series._from_data( + {None: result}, index=self.series._index, name=self.series.name, + ) + + @property + def is_quarter_end(self): + """ + Boolean indicator if the date is the last day of a quarter. + + Returns + ------- + Series + Booleans indicating if dates are the end of a quarter + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series( + ... pd.date_range(start='2000-09-26', end='2000-10-03', freq='1D')) + >>> s + 0 2000-09-26 + 1 2000-09-27 + 2 2000-09-28 + 3 2000-09-29 + 4 2000-09-30 + 5 2000-10-01 + 6 2000-10-02 + 7 2000-10-03 + dtype: datetime64[ns] + >>> s.dt.is_quarter_end + 0 False + 1 False + 2 False + 3 False + 4 True + 5 False + 6 False + 7 False + dtype: bool + """ + day = self.series._column.get_dt_field("day") + last_day = libcudf.datetime.last_day_of_month(self.series._column) + last_day = last_day.get_dt_field("day") + last_month = self.series._column.get_dt_field("month").isin( + [3, 6, 9, 12] + ) + + result = ((day == last_day) & last_month).fillna(False) + return Series._from_data( + {None: result}, index=self.series._index, name=self.series.name, + ) + + @property + def is_year_start(self): + """ + Boolean indicator if the date is the first day of the year. + + Returns + ------- + Series + Booleans indicating if dates are the first day of the year. + + Example + ------- + >>> import pandas as pd, cudf + >>> s = cudf.Series(pd.date_range("2017-12-30", periods=3)) + >>> dates + 0 2017-12-30 + 1 2017-12-31 + 2 2018-01-01 + dtype: datetime64[ns] + >>> dates.dt.is_year_start + 0 False + 1 False + 2 True + dtype: bool + """ + outcol = self.series._column.get_dt_field( + "day_of_year" + ) == cudf.Scalar(1) + return Series._from_data( + {None: outcol.fillna(False)}, + index=self.series._index, + name=self.series.name, + ) + + @property + def is_year_end(self): + """ + Boolean indicator if the date is the last day of the year. + + Returns + ------- + Series + Booleans indicating if dates are the last day of the year. 
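The is_quarter_start check above combines two element-wise tests on device: day-of-month equal to 1 and month in {1, 4, 7, 10}. A host-side sketch of the same test with toy values:

    import numpy as np

    day = np.array([30, 1, 2])
    month = np.array([9, 10, 10])
    is_quarter_start = (day == 1) & np.isin(month, [1, 4, 7, 10])
    print(is_quarter_start)          # [False  True False]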
+ + Example + ------- + >>> import pandas as pd, cudf + >>> dates = cudf.Series(pd.date_range("2017-12-30", periods=3)) + >>> dates + 0 2017-12-30 + 1 2017-12-31 + 2 2018-01-01 + dtype: datetime64[ns] + >>> dates.dt.is_year_end + 0 False + 1 True + 2 False + dtype: bool + """ + day_of_year = self.series._column.get_dt_field("day_of_year") + leap_dates = libcudf.datetime.is_leap_year(self.series._column) + + leap = day_of_year == cudf.Scalar(366) + non_leap = day_of_year == cudf.Scalar(365) + result = cudf._lib.copying.copy_if_else(leap, non_leap, leap_dates) + result = result.fillna(False) + return Series._from_data( + {None: result}, index=self.series._index, name=self.series.name, + ) + def _get_dt_field(self, field): out_column = self.series._column.get_dt_field(field) return Series( @@ -6828,7 +6287,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): for sr in series_list[1:]: if not sr.index.names == head.names: all_names_equal = False - new_index_names = [None] + new_index_names = [None] * head.nlevels if all_names_equal: new_index_names = head.names diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 00f60cfc8b5..946cdcb1ebc 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -345,6 +345,66 @@ def get_units(value): class DateOffset: + """ + An object used for binary ops where calendrical arithmetic + is desired rather than absolute time arithmetic. Used to + add or subtract a whole number of periods, such as several + months or years, to a series or index of datetime dtype. + Works similarly to pd.DateOffset, but stores the offset + on the device (GPU). + + Parameters + ---------- + n : int, default 1 + The number of time periods the offset represents. + **kwds + Temporal parameter that add to or replace the offset value. + Parameters that **add** to the offset (like Timedelta): + - months + + See Also + -------- + pandas.DateOffset : The equivalent Pandas object that this + object replicates + + Examples + -------- + >>> from cudf import DateOffset + >>> ts = cudf.Series([ + "2000-01-01 00:00:00.012345678", + "2000-01-31 00:00:00.012345678", + "2000-02-29 00:00:00.012345678", + ], dtype='datetime64[ns]) + >>> ts + DateOffset(months=3) + 0 2000-04-01 00:00:00.012345678 + 1 2000-04-30 00:00:00.012345678 + 2 2000-05-29 00:00:00.012345678 + dtype: datetime64[ns] + >>> ts - DateOffset(months=12) + 0 1999-01-01 00:00:00.012345678 + 1 1999-01-31 00:00:00.012345678 + 2 1999-02-28 00:00:00.012345678 + dtype: datetime64[ns] + + Notes + ----- + Note that cuDF does not yet support DateOffset arguments + that 'replace' units in the datetime data being operated on + such as + - year + - month + - week + - day + - hour + - minute + - second + - microsecond + - millisecond + - nanosecond + + cuDF does not yet support rounding via a `normalize` + keyword argument. + """ _UNITS_TO_CODES = { "nanoseconds": "ns", @@ -362,66 +422,6 @@ class DateOffset: _CODES_TO_UNITS = {v: k for k, v in _UNITS_TO_CODES.items()} def __init__(self, n=1, normalize=False, **kwds): - """ - An object used for binary ops where calendrical arithmetic - is desired rather than absolute time arithmetic. Used to - add or subtract a whole number of periods, such as several - months or years, to a series or index of datetime dtype. - Works similarly to pd.DateOffset, but stores the offset - on the device (GPU). 
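A runnable form of the DateOffset example from the class docstring above, with the dtype string fully quoted; the offset performs calendar arithmetic in whole months on a datetime Series:

    import cudf
    from cudf import DateOffset

    ts = cudf.Series(
        ["2000-01-01 00:00:00.012345678",
         "2000-01-31 00:00:00.012345678",
         "2000-02-29 00:00:00.012345678"],
        dtype="datetime64[ns]",
    )
    print(ts + DateOffset(months=3))   # 2000-04-01, 2000-04-30, 2000-05-29 per the docstring
    print(ts - DateOffset(months=12))  # 1999-01-01, 1999-01-31, 1999-02-28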
- - Parameters - ---------- - n : int, default 1 - The number of time periods the offset represents. - **kwds - Temporal parameter that add to or replace the offset value. - Parameters that **add** to the offset (like Timedelta): - - months - - See Also - -------- - pandas.DateOffset : The equivalent Pandas object that this - object replicates - - Examples - -------- - >>> from cudf import DateOffset - >>> ts = cudf.Series([ - "2000-01-01 00:00:00.012345678", - "2000-01-31 00:00:00.012345678", - "2000-02-29 00:00:00.012345678", - ], dtype='datetime64[ns]) - >>> ts + DateOffset(months=3) - 0 2000-04-01 00:00:00.012345678 - 1 2000-04-30 00:00:00.012345678 - 2 2000-05-29 00:00:00.012345678 - dtype: datetime64[ns] - >>> ts - DateOffset(months=12) - 0 1999-01-01 00:00:00.012345678 - 1 1999-01-31 00:00:00.012345678 - 2 1999-02-28 00:00:00.012345678 - dtype: datetime64[ns] - - Notes - ----- - Note that cuDF does not yet support DateOffset arguments - that 'replace' units in the datetime data being operated on - such as - - year - - month - - week - - day - - hour - - minute - - second - - microsecond - - millisecond - - nanosecond - - cuDF does not yet support rounding via a `normalize` - keyword argument. - """ if normalize: raise NotImplementedError( "normalize not yet supported for DateOffset" @@ -495,7 +495,7 @@ def __init__(self, n=1, normalize=False, **kwds): dtype = "int16" else: unit = self._UNITS_TO_CODES[k] - dtype = np.dtype(f"timedelta64[{unit}]") + dtype = cudf.dtype(f"timedelta64[{unit}]") scalars[k] = cudf.Scalar(v, dtype=dtype) self._scalars = scalars diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 6d31c1ba74d..d5c4df12246 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -109,7 +109,7 @@ def to_numeric(arg, errors="raise", downcast=None): dtype = col.dtype if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype): - col = col.as_numerical_column(np.dtype("int64")) + col = col.as_numerical_column(cudf.dtype("int64")) elif is_categorical_dtype(dtype): cat_dtype = col.dtype.type if _is_non_decimal_numeric_dtype(cat_dtype): @@ -140,7 +140,7 @@ def to_numeric(arg, errors="raise", downcast=None): raise ValueError("Unrecognized datatype") # str->float conversion may require lower precision - if col.dtype == np.dtype("f"): + if col.dtype == cudf.dtype("f"): col = col.as_numerical_column("d") if downcast: @@ -150,13 +150,13 @@ def to_numeric(arg, errors="raise", downcast=None): "unsigned": list(np.typecodes["UnsignedInteger"]), } float_types = list(np.typecodes["Float"]) - idx = float_types.index(np.dtype(np.float32).char) + idx = float_types.index(cudf.dtype(np.float32).char) downcast_type_map["float"] = float_types[idx:] type_set = downcast_type_map[downcast] for t in type_set: - downcast_dtype = np.dtype(t) + downcast_dtype = cudf.dtype(t) if downcast_dtype.itemsize <= col.dtype.itemsize: if col.can_cast_safely(downcast_dtype): col = libcudf.unary.cast(col, downcast_dtype) @@ -197,7 +197,7 @@ def _convert_str_col(col, errors, _downcast=None): is_integer = libstrings.is_integer(col) if is_integer.all(): - return col.as_numerical_column(dtype=np.dtype("i8")) + return col.as_numerical_column(dtype=cudf.dtype("i8")) col = _proc_inf_empty_strings(col) @@ -210,9 +210,9 @@ def _convert_str_col(col, errors, _downcast=None): "limited by float32 precision." 
) ) - return col.as_numerical_column(dtype=np.dtype("f")) + return col.as_numerical_column(dtype=cudf.dtype("f")) else: - return col.as_numerical_column(dtype=np.dtype("d")) + return col.as_numerical_column(dtype=cudf.dtype("d")) else: if errors == "coerce": col = libcudf.string_casting.stod(col) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index d9a2fd89165..e3ed15ba2a6 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -215,7 +215,7 @@ def _apply_agg_series(self, sr, agg_name): self.center, agg_name, ) - return sr._copy_construct(data=result_col) + return sr._from_data({sr.name: result_col}, sr._index) def _apply_agg_dataframe(self, df, agg_name): result_df = cudf.DataFrame({}) @@ -258,12 +258,12 @@ def apply(self, func, *args, **kwargs): See also -------- - cudf.core.series.Series.applymap : Apply an elementwise function to + cudf.Series.applymap : Apply an elementwise function to transform the values in the Column. Notes ----- - See notes of the :meth:`cudf.core.series.Series.applymap` + See notes of the :meth:`cudf.Series.applymap` """ has_nulls = False @@ -353,14 +353,15 @@ def __repr__(self): class RollingGroupby(Rolling): - def __init__(self, groupby, window, min_periods=None, center=False): - """ - Grouped rolling window calculation. + """ + Grouped rolling window calculation. - See also - -------- - cudf.core.window.Rolling - """ + See also + -------- + cudf.core.window.Rolling + """ + + def __init__(self, groupby, window, min_periods=None, center=False): sort_order = groupby.grouping.keys.argsort() # TODO: there may be overlap between the columns diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index 5e54af86bb5..b568c108191 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -2,6 +2,8 @@ import pandas as pd import cudf +from cudf._lib.transform import bools_to_mask +from cudf.core.column_accessor import ColumnAccessor __all__ = ["timeseries", "randomdata"] @@ -9,7 +11,12 @@ # TODO: # change default of name from category to str type when nvstring are merged def timeseries( - start="2000-01-01", end="2000-01-31", freq="1s", dtypes=None, seed=None, + start="2000-01-01", + end="2000-01-31", + freq="1s", + dtypes=None, + nulls_frequency=0, + seed=None, ): """ Create timeseries dataframe with random data @@ -26,6 +33,8 @@ def timeseries( ``{"name": "category", "id": int, "x": float, "y": float}`` freq : string String like '2s' or '1H' or '12W' for the time series frequency + nulls_frequency : float + Fill the series with the specified proportion of nulls. Default is 0. seed : int (optional) Randomstate seed @@ -54,7 +63,21 @@ def timeseries( df = pd.DataFrame(columns, index=index, columns=sorted(columns)) if df.index[-1] == end: df = df.iloc[:-1] - return cudf.from_pandas(df) + + gdf = cudf.from_pandas(df) + for col in gdf: + mask = state.choice( + [True, False], + size=len(index), + p=[1 - nulls_frequency, nulls_frequency], + ) + mask_buf = bools_to_mask(cudf.core.column.as_column(mask)) + masked_col = gdf[col]._column.set_mask(mask_buf) + gdf[col] = cudf.Series._from_data( + ColumnAccessor({None: masked_col}), index=gdf.index + ) + + return gdf def randomdata(nrows=10, dtypes=None, seed=None): diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py index a6713e85e76..9e38b6e896d 100644 --- a/python/cudf/cudf/io/avro.py +++ b/python/cudf/cudf/io/avro.py @@ -1,4 +1,5 @@ # Copyright (c) 2019, NVIDIA CORPORATION. 
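A hedged usage sketch for the nulls_frequency parameter added to cudf.datasets.timeseries above; the realized null fraction is random but should land near the requested proportion.

    import cudf

    gdf = cudf.datasets.timeseries(
        start="2000-01-01",
        end="2000-01-02",
        freq="1H",
        dtypes={"x": int, "y": float},
        nulls_frequency=0.2,
        seed=1,
    )
    # Each column should contain roughly 20% nulls.
    print(gdf.isna().mean())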
+import cudf from cudf import _lib as libcudf from cudf.utils import ioutils @@ -14,8 +15,6 @@ def read_avro( ): """{docstring}""" - from cudf import DataFrame - is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer( path_or_data=filepath_or_buffer, **kwargs, ) @@ -31,8 +30,8 @@ def read_avro( ValueError("URL content-encoding decompression is not supported") if engine == "cudf": - return DataFrame._from_table( - libcudf.avro.read_avro( + return cudf.DataFrame._from_data( + *libcudf.avro.read_avro( filepath_or_buffer, columns, skiprows, num_rows ) ) diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index b8a76890913..9d97bee0396 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -35,12 +35,12 @@ def from_dlpack(pycapsule_obj): tensor is row-major, transpose it before passing it to this function. """ - res = libdlpack.from_dlpack(pycapsule_obj) + data, _ = libdlpack.from_dlpack(pycapsule_obj) - if res._num_columns == 1: - return Series(res._data[0]) + if len(data) == 1: + return Series._from_data(data) else: - return DataFrame(data=res._data) + return DataFrame._from_data(data) @ioutils.doc_to_dlpack() diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index b605bf90ff4..8a00d9c73a0 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -53,8 +53,8 @@ def read_json( else: filepaths_or_buffers.append(tmp_source) - return cudf.DataFrame._from_table( - libjson.read_json( + return cudf.DataFrame._from_data( + *libjson.read_json( filepaths_or_buffers, dtype, lines, compression, byte_range ) ) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index a99f82fde7a..8f6002bb577 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -290,8 +290,8 @@ def read_orc( stripes = selected_stripes if engine == "cudf": - df = DataFrame._from_table( - liborc.read_orc( + return DataFrame._from_data( + *liborc.read_orc( filepaths_or_buffers, columns, stripes, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index a18486cff3c..fa748761695 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -210,6 +210,10 @@ def read_parquet( else: filepaths_or_buffers.append(tmp_source) + if columns is not None: + if not is_list_like(columns): + raise ValueError("Expected list like for columns") + if filters is not None: # Convert filters to ds.Expression filters = pq._filters_to_expression(filters) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 672e83e6f64..b101835e626 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -245,7 +245,7 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs): def gen_rand(dtype, size, **kwargs): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.kind == "f": res = np.random.random(size=size).astype(dtype) if kwargs.get("positive_only", False): @@ -284,7 +284,7 @@ def gen_rand(dtype, size, **kwargs): return pd.to_datetime( np.random.randint(low=low, high=high, size=size), unit=time_unit ) - elif dtype.kind == "U": + elif dtype.kind in ("O", "U"): return pd.util.testing.rands_array(10, size) raise NotImplementedError(f"dtype.kind={dtype.kind}") diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 5e03068f818..cdea22a05af 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ 
b/python/cudf/cudf/testing/dataset_generator.py @@ -18,6 +18,7 @@ from pyarrow import parquet as pq import cudf +from cudf.utils.dtypes import np_to_pa_dtype class ColumnParameters: @@ -94,6 +95,7 @@ def _write(tbl, path, format): def _generate_column(column_params, num_rows): # If cardinality is specified, we create a set to sample from. # Otherwise, we simply use the given generator to generate each value. + if column_params.cardinality is not None: # Construct set of values to sample from where # set size = cardinality @@ -127,7 +129,7 @@ def _generate_column(column_params, num_rows): if hasattr(column_params.dtype, "to_arrow"): arrow_type = column_params.dtype.to_arrow() elif column_params.dtype is not None: - arrow_type = pa.from_numpy_dtype(column_params.dtype) + arrow_type = np_to_pa_dtype(cudf.dtype(column_params.dtype)) else: arrow_type = None @@ -227,15 +229,15 @@ def get_dataframe(parameters, use_threads): ): arrow_type = pa.dictionary( index_type=pa.int64(), - value_type=pa.from_numpy_dtype( - type(next(iter(column_params.generator))) + value_type=np_to_pa_dtype( + cudf.dtype(type(next(iter(column_params.generator)))) ), ) elif hasattr(column_params.dtype, "to_arrow"): arrow_type = column_params.dtype.to_arrow() else: - arrow_type = pa.from_numpy_dtype( - type(next(iter(column_params.generator))) + arrow_type = np_to_pa_dtype( + cudf.dtype(type(next(iter(column_params.generator)))) if column_params.dtype is None else column_params.dtype ) @@ -380,7 +382,7 @@ def rand_dataframe( ) ) else: - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.kind in ("i", "u"): column_params.append( ColumnParameters( @@ -428,7 +430,7 @@ def rand_dataframe( dtype=dtype, size=cardinality ), is_sorted=False, - dtype=np.dtype(dtype), + dtype=cudf.dtype(dtype), ) ) elif dtype.kind == "m": @@ -440,7 +442,7 @@ def rand_dataframe( dtype=dtype, size=cardinality ), is_sorted=False, - dtype=np.dtype(dtype), + dtype=cudf.dtype(dtype), ) ) elif dtype.kind == "b": @@ -450,7 +452,7 @@ def rand_dataframe( null_frequency=null_frequency, generator=boolean_generator(cardinality), is_sorted=False, - dtype=np.dtype(dtype), + dtype=cudf.dtype(dtype), ) ) else: @@ -538,7 +540,7 @@ def get_values_for_nested_data(dtype, lists_max_length): Returns list of values based on dtype. 
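For context on the dataset generator changes above, a small sketch of the dtype mapping they rely on; it assumes np_to_pa_dtype resolves datetime units the same way the generator uses it.

    import cudf
    from cudf.utils.dtypes import np_to_pa_dtype

    # cudf.dtype normalizes string aliases to a numpy dtype first, and
    # np_to_pa_dtype then maps that numpy dtype to the matching pyarrow type.
    print(np_to_pa_dtype(cudf.dtype("datetime64[ms]")))  # expected: timestamp[ms]
    print(np_to_pa_dtype(cudf.dtype("float32")))         # expected: float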
""" cardinality = np.random.randint(0, lists_max_length) - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.kind in ("i", "u"): values = int_generator(dtype=dtype, size=cardinality)() elif dtype.kind == "f": diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 8277b8e7b32..abdac07d65d 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -13,7 +13,7 @@ import pytest import cudf -from cudf.core import Series +from cudf import Series from cudf.core.index import as_index from cudf.testing import _utils as utils from cudf.utils.dtypes import ( @@ -931,7 +931,7 @@ def test_ufunc_ops(lhs, rhs, ops): def dtype_scalar(val, dtype): if dtype == "str": return str(val) - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if dtype.type in {np.datetime64, np.timedelta64}: res, _ = np.datetime_data(dtype) return dtype.type(val, res) @@ -1695,13 +1695,15 @@ def test_binops_with_lhs_numpy_scalar(frame, dtype): ) if dtype == "datetime64[s]": - val = np.dtype(dtype).type(4, "s") + val = cudf.dtype(dtype).type(4, "s") elif dtype == "timedelta64[s]": - val = np.dtype(dtype).type(4, "s") + val = cudf.dtype(dtype).type(4, "s") elif dtype == "category": val = np.int64(4) + elif dtype == "str": + val = str(4) else: - val = np.dtype(dtype).type(4) + val = cudf.dtype(dtype).type(4) expected = val == data.to_pandas() got = val == data @@ -1758,16 +1760,16 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.add, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), - ["3.0", "4.0"], cudf.Decimal64Dtype(scale=2, precision=3), + ["3.0", "4.0"], + cudf.Decimal64Dtype(scale=2, precision=4), ), ( operator.add, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["2.25", "1.005"], cudf.Decimal64Dtype(scale=3, precision=4), ["3.75", "3.005"], @@ -1785,7 +1787,7 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.sub, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=1, precision=2), ["2.25", "1.005"], cudf.Decimal64Dtype(scale=3, precision=4), ["-0.75", "0.995"], @@ -1794,7 +1796,7 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.sub, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=1, precision=2), ["2.25", "1.005"], cudf.Decimal64Dtype(scale=3, precision=4), ["-0.75", "0.995"], @@ -1812,11 +1814,11 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.mul, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["1.5", "3.0"], cudf.Decimal64Dtype(scale=3, precision=4), ["2.25", "6.0"], - cudf.Decimal64Dtype(scale=5, precision=7), + cudf.Decimal64Dtype(scale=5, precision=8), ), ( operator.mul, @@ -1866,16 +1868,16 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.add, ["1.5", None, "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=1, precision=2), ["1.5", None, "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=1, precision=2), ["3.0", None, "4.0"], - cudf.Decimal64Dtype(scale=2, precision=3), + cudf.Decimal64Dtype(scale=1, precision=3), ), ( operator.add, ["1.5", None], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["2.25", "1.005"], cudf.Decimal64Dtype(scale=3, precision=4), 
["3.75", None], @@ -1884,7 +1886,7 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.sub, ["1.5", None], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["2.25", None], cudf.Decimal64Dtype(scale=3, precision=4), ["-0.75", None], @@ -1893,7 +1895,7 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.sub, ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["2.25", None], cudf.Decimal64Dtype(scale=3, precision=4), ["-0.75", None], @@ -1902,11 +1904,11 @@ def test_binops_with_NA_consistent(dtype, op): ( operator.mul, ["1.5", None], - cudf.Decimal64Dtype(scale=2, precision=2), + cudf.Decimal64Dtype(scale=2, precision=3), ["1.5", None], cudf.Decimal64Dtype(scale=3, precision=4), ["2.25", None], - cudf.Decimal64Dtype(scale=5, precision=7), + cudf.Decimal64Dtype(scale=5, precision=8), ), ( operator.mul, @@ -2432,10 +2434,10 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): ( operator.truediv, ["100", "200"], - cudf.Decimal64Dtype(scale=2, precision=4), + cudf.Decimal64Dtype(scale=2, precision=5), decimal.Decimal(2), ["50", "100"], - cudf.Decimal64Dtype(scale=2, precision=6), + cudf.Decimal64Dtype(scale=2, precision=7), False, ), ( @@ -2459,10 +2461,10 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): ( operator.truediv, ["100", "200"], - cudf.Decimal64Dtype(scale=2, precision=3), + cudf.Decimal64Dtype(scale=2, precision=5), 1, ["0", "0"], - cudf.Decimal64Dtype(scale=-2, precision=5), + cudf.Decimal64Dtype(scale=-2, precision=7), True, ), ( @@ -2793,11 +2795,11 @@ def test_column_null_scalar_comparison(dtype, null_scalar, cmpop): # a new series where all the elements are . if isinstance(null_scalar, np.datetime64): - if np.dtype(dtype).kind not in "mM": + if cudf.dtype(dtype).kind not in "mM": pytest.skip() null_scalar = null_scalar.astype(dtype) - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) data = [1, 2, 3, 4, 5] sr = cudf.Series(data, dtype=dtype) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index d8e10a62a12..51327038c39 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -799,7 +799,7 @@ def test_categorical_setitem_with_nan(): @pytest.mark.parametrize("dtype", list(NUMERIC_TYPES) + ["object"]) @pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) def test_series_construction_with_nulls(input_obj, dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) input_obj = [ dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj ] diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index f3387b3d27d..cc4c98b611f 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -362,7 +362,7 @@ def test_column_view_string_slice(slc): ) def test_as_column_buffer(data, expected): actual_column = cudf.core.column.as_column( - cudf.core.Buffer(data), dtype=data.dtype + cudf.core.buffer.Buffer(data), dtype=data.dtype ) assert_eq(cudf.Series(actual_column), cudf.Series(expected)) @@ -481,3 +481,29 @@ def test_concatenate_large_column_strings(): match="total size of output is too large for a cudf column", ): cudf.concat([s_1, s_2]) + + +@pytest.mark.parametrize( + "alias,expect_dtype", + [ + ("UInt8", "uint8"), + ("UInt16", "uint16"), + ("UInt32", "uint32"), + ("UInt64", "uint64"), + ("Int8", "int8"), + ("Int16", "int16"), + 
("Int32", "int32"), + ("Int64", "int64"), + ("boolean", "bool"), + ("Float32", "float32"), + ("Float64", "float64"), + ], +) +@pytest.mark.parametrize( + "data", [[1, 2, 0]], +) +def test_astype_with_aliases(alias, expect_dtype, data): + pd_data = pd.Series(data) + gd_data = cudf.Series.from_pandas(pd_data) + + assert_eq(pd_data.astype(expect_dtype), gd_data.astype(alias)) diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index b6650600261..f06142f4cc9 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -4,6 +4,7 @@ import pandas as pd import pytest +import cudf from cudf import Series from cudf.core.index import RangeIndex, as_index from cudf.testing._utils import ( @@ -82,7 +83,7 @@ def test_rangeindex_contains(): @pytest.mark.parametrize("dtype", NUMERIC_TYPES) def test_lists_contains(dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) inner_data = np.array([1, 2, 3], dtype=dtype) data = Series([inner_data]) @@ -96,7 +97,7 @@ def test_lists_contains(dtype): @pytest.mark.parametrize("dtype", DATETIME_TYPES + TIMEDELTA_TYPES) def test_lists_contains_datetime(dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) inner_data = np.array([1, 2, 3]) unit, _ = np.datetime_data(dtype) diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py index 0965b5298a4..21a6a9172db 100644 --- a/python/cudf/cudf/tests/test_copying.py +++ b/python/cudf/cudf/tests/test_copying.py @@ -5,7 +5,7 @@ import pytest import cudf -from cudf.core import Series +from cudf import Series from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 5511a65d0a4..f04a5e6dca0 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -320,7 +320,6 @@ def test_csv_reader_dtype_dict(use_names): dtypes = df.dtypes.to_dict() gdf_names = list(gdf_dtypes.keys()) if use_names else None pdf_names = list(pdf_dtypes.keys()) if use_names else None - gdf = read_csv(StringIO(buffer), dtype=dtypes, names=gdf_names) pdf = pd.read_csv(StringIO(buffer), dtype=dtypes, names=pdf_names) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 9acf6783095..a337660b5b0 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1828,42 +1828,79 @@ def gdf(pdf): @pytest.mark.parametrize( "data", [ - {"x": [np.nan, 2, 3, 4, 100, np.nan], "y": [4, 5, 6, 88, 99, np.nan]}, - {"x": [1, 2, 3], "y": [4, 5, 6]}, - {"x": [np.nan, np.nan, np.nan], "y": [np.nan, np.nan, np.nan]}, - {"x": [], "y": []}, + { + "x": [np.nan, 2, 3, 4, 100, np.nan], + "y": [4, 5, 6, 88, 99, np.nan], + "z": [7, 8, 9, 66, np.nan, 77], + }, + {"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}, + { + "x": [np.nan, np.nan, np.nan], + "y": [np.nan, np.nan, np.nan], + "z": [np.nan, np.nan, np.nan], + }, + {"x": [], "y": [], "z": []}, {"x": []}, ], ) +@pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize( "func", [ - lambda df, **kwargs: df.min(**kwargs), - lambda df, **kwargs: df.max(**kwargs), - lambda df, **kwargs: df.sum(**kwargs), - lambda df, **kwargs: df.product(**kwargs), - lambda df, **kwargs: df.cummin(**kwargs), - lambda df, **kwargs: df.cummax(**kwargs), - lambda df, **kwargs: df.cumsum(**kwargs), - lambda df, **kwargs: df.cumprod(**kwargs), - lambda df, **kwargs: df.mean(**kwargs), - lambda df, 
**kwargs: df.sum(**kwargs), - lambda df, **kwargs: df.max(**kwargs), - lambda df, **kwargs: df.std(ddof=1, **kwargs), - lambda df, **kwargs: df.var(ddof=1, **kwargs), - lambda df, **kwargs: df.std(ddof=2, **kwargs), - lambda df, **kwargs: df.var(ddof=2, **kwargs), - lambda df, **kwargs: df.kurt(**kwargs), - lambda df, **kwargs: df.skew(**kwargs), - lambda df, **kwargs: df.all(**kwargs), - lambda df, **kwargs: df.any(**kwargs), + "min", + "max", + "sum", + "prod", + "product", + "cummin", + "cummax", + "cumsum", + "cumprod", + "mean", + "median", + "sum", + "max", + "std", + "var", + "kurt", + "skew", + "all", + "any", ], ) @pytest.mark.parametrize("skipna", [True, False, None]) -def test_dataframe_reductions(data, func, skipna): +def test_dataframe_reductions(data, axis, func, skipna): pdf = pd.DataFrame(data=data) gdf = cudf.DataFrame.from_pandas(pdf) - assert_eq(func(pdf, skipna=skipna), func(gdf, skipna=skipna)) + + # Reductions can fail in numerous possible ways when attempting row-wise + # reductions, which are only partially supported. Catching the appropriate + # exception here allows us to detect API breakage in the form of changing + # exceptions. + expected_exception = None + if axis == 1: + if func in ("kurt", "skew"): + expected_exception = NotImplementedError + elif func not in cudf.core.dataframe._cupy_nan_methods_map: + if skipna is False: + expected_exception = NotImplementedError + elif any(col.nullable for name, col in gdf.iteritems()): + expected_exception = ValueError + elif func in ("cummin", "cummax"): + expected_exception = AttributeError + + # Test different degrees of freedom for var and std. + all_kwargs = [{"ddof": 1}, {"ddof": 2}] if func in ("var", "std") else [{}] + for kwargs in all_kwargs: + if expected_exception is not None: + with pytest.raises(expected_exception): + getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs), + else: + assert_eq( + getattr(pdf, func)(axis=axis, skipna=skipna, **kwargs), + getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs), + check_dtype=False, + ) @pytest.mark.parametrize( @@ -3423,8 +3460,6 @@ def test_all(data): expected = pdata.all(bool_only=True) assert_eq(got, expected) else: - with pytest.raises(NotImplementedError): - gdata.all(bool_only=False) with pytest.raises(NotImplementedError): gdata.all(level="a") @@ -3484,8 +3519,6 @@ def test_any(data, axis): expected = pdata.any(bool_only=True) assert_eq(got, expected) else: - with pytest.raises(NotImplementedError): - gdata.any(bool_only=False) with pytest.raises(NotImplementedError): gdata.any(level="a") @@ -3616,9 +3649,7 @@ def test_as_column_types(): assert_eq(pds, gds) pds = pd.Series(pd.Index(["1", "18", "9"]), dtype="int") - gds = cudf.Series( - cudf.core.index.StringIndex(["1", "18", "9"]), dtype="int" - ) + gds = cudf.Series(cudf.StringIndex(["1", "18", "9"]), dtype="int") assert_eq(pds, gds) @@ -5054,6 +5085,18 @@ def test_insert(data): assert_eq(pdf, gdf) +@pytest.mark.parametrize( + "data", [{"A": [1, 2, 3], "B": ["a", "b", "c"]}], +) +def test_insert_NA(data): + pdf = pd.DataFrame.from_dict(data) + gdf = cudf.DataFrame.from_pandas(pdf) + + pdf["C"] = pd.NA + gdf["C"] = cudf.NA + assert_eq(pdf, gdf) + + def test_cov(): gdf = cudf.datasets.randomdata(10) pdf = gdf.to_pandas() @@ -5372,14 +5415,6 @@ def test_change_column_dtype_in_empty(): assert_eq(pdf, gdf) -def test_dataframe_from_table_empty_index(): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - odict = df._data - tbl = cudf._lib.table.Table(odict) - - result = cudf.DataFrame._from_table(tbl) # 
noqa: F841 - - @pytest.mark.parametrize("dtype", ["int64", "str"]) def test_dataframe_from_dictionary_series_same_name_index(dtype): pd_idx1 = pd.Index([1, 2, 0], name="test_index").astype(dtype) @@ -8108,17 +8143,7 @@ def custom_func(df, column): @pytest.mark.parametrize( - "op", - [ - "count", - "cummin", - "cummax", - "cummax", - "cumprod", - "kurt", - "kurtosis", - "skew", - ], + "op", ["count", "kurt", "kurtosis", "skew"], ) def test_dataframe_axis1_unsupported_ops(op): df = cudf.DataFrame({"a": [1, 2, 3], "b": [8, 9, 10]}) @@ -8732,3 +8757,60 @@ def test_frame_series_where(): expected = gdf.where(gdf.notna(), gdf.mean()) actual = pdf.where(pdf.notna(), pdf.mean(), axis=1) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "array,is_error", + [ + (cupy.arange(20, 40).reshape(-1, 2), False), + (cupy.arange(20, 50).reshape(-1, 3), True), + (np.arange(20, 40).reshape(-1, 2), False), + (np.arange(20, 30).reshape(-1, 1), False), + (cupy.arange(20, 30).reshape(-1, 1), False), + ], +) +def test_dataframe_indexing_setitem_np_cp_array(array, is_error): + gdf = cudf.DataFrame({"a": range(10), "b": range(10)}) + pdf = gdf.to_pandas() + if not is_error: + gdf.loc[:, ["a", "b"]] = array + pdf.loc[:, ["a", "b"]] = cupy.asnumpy(array) + + assert_eq(gdf, pdf) + else: + assert_exceptions_equal( + lfunc=pdf.loc.__setitem__, + rfunc=gdf.loc.__setitem__, + lfunc_args_and_kwargs=( + [(slice(None, None, None), ["a", "b"]), cupy.asnumpy(array)], + {}, + ), + rfunc_args_and_kwargs=( + [(slice(None, None, None), ["a", "b"]), array], + {}, + ), + compare_error_message=False, + expected_error_message="shape mismatch: value array of shape " + "(10, 3) could not be broadcast to indexing " + "result of shape (10, 2)", + ) + + +@pytest.mark.parametrize( + "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}], +) +def test_frame_series_where_other(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expected = gdf.where(gdf["b"] == 1, cudf.NA) + actual = pdf.where(pdf["b"] == 1, pd.NA) + assert_eq( + actual.fillna(-1).values, + expected.fillna(-1).values, + check_dtype=False, + ) + + expected = gdf.where(gdf["b"] == 1, 0) + actual = pdf.where(pdf["b"] == 1, 0) + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py index b7bc89f008d..c9f07eab5dd 100644 --- a/python/cudf/cudf/tests/test_datasets.py +++ b/python/cudf/cudf/tests/test_datasets.py @@ -6,10 +6,10 @@ def test_dataset_timeseries(): gdf1 = gd.datasets.timeseries( - dtypes={"x": int, "y": float}, freq="120s", seed=1 + dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3, seed=1 ) gdf2 = gd.datasets.timeseries( - dtypes={"x": int, "y": float}, freq="120s", seed=1 + dtypes={"x": int, "y": float}, freq="120s", nulls_frequency=0.3, seed=1 ) assert_eq(gdf1, gdf2) @@ -23,6 +23,7 @@ def test_dataset_timeseries(): "2010", freq="2H", dtypes={"value": float, "name": "category", "id": int}, + nulls_frequency=0.7, seed=1, ) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 5f5a0a78414..9f19bf8b960 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -12,7 +12,8 @@ import pytest import cudf -from cudf.core import DataFrame, Series +import cudf.testing.dataset_generator as dataset_generator +from cudf import DataFrame, Series from cudf.core.index import DatetimeIndex from cudf.testing._utils import ( DATETIME_TYPES, @@ -1299,6 +1300,58 @@ def test_is_leap_year(): assert_eq(expect2, 
got2) +def test_quarter(): + data = [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + "1900-02-28 07:00:00", + "1800-03-14 07:30:00", + "2100-03-14 07:30:00", + "1970-01-01 00:00:00", + "1969-12-31 12:59:00", + ] + dtype = "datetime64[s]" + + # Series + ps = pd.Series(data, dtype=dtype) + gs = cudf.from_pandas(ps) + + expect = ps.dt.quarter + got = gs.dt.quarter + + assert_eq(expect, got, check_dtype=False) + + # DatetimeIndex + pIndex = pd.DatetimeIndex(data) + gIndex = cudf.from_pandas(pIndex) + + expect2 = pIndex.quarter + got2 = gIndex.quarter + + assert isinstance(got2, cudf.Int8Index) + assert_eq(expect2.values, got2.values, check_dtype=False) + + +@pytest.mark.parametrize("dtype", DATETIME_TYPES) +def test_days_in_months(dtype): + nrows = 1000 + + data = dataset_generator.rand_dataframe( + dtypes_meta=[ + {"dtype": dtype, "null_frequency": 0.4, "cardinality": nrows} + ], + rows=nrows, + use_threads=False, + seed=23, + ) + + ps = data.to_pandas()["0"] + gs = cudf.from_pandas(ps) + + assert_eq(ps.dt.days_in_month, gs.dt.days_in_month) + + @pytest.mark.parametrize( "data", [ @@ -1326,3 +1379,174 @@ def test_is_month_start(data, dtype): got = gs.dt.is_month_start assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + "2020-05-31", + "2020-02-29", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-01-01", + "1969-12-11", + ] + ], +) +@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) +def test_is_month_end(data, dtype): + # Series + ps = pd.Series(data, dtype=dtype) + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_month_end + got = gs.dt.is_month_end + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + "2020-05-31", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-01-01", + "1800-03-14", + "2100-03-10", + "1970-01-01", + "1969-12-11", + "2017-12-30", + "2017-12-31", + "2018-01-01", + ] + ], +) +@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) +def test_is_year_start(data, dtype): + ps = pd.Series(data, dtype=dtype) + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_year_start + got = gs.dt.is_year_start + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + "2020-05-31", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-12-31", + "1800-03-14", + "2017-12-30", + "2017-12-31", + "2020-12-31 08:00:00", + None, + "1999-12-31 18:40:00", + "2000-12-31 04:00:00", + None, + "1800-12-14 07:30:00", + "2100-12-14 07:30:00", + "2020-05-31", + ] + ], +) +@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) +def test_is_year_end(data, dtype): + ps = pd.Series(data, dtype=dtype) + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_year_end + got = gs.dt.is_year_end + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + "2020-05-01", + "2020-05-31", + "2020-02-29", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-04-1", + "1970-01-01", + "1969-12-11", + "2020-12-31", + ] + ], +) +@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) +def test_is_quarter_start(data, dtype): + # Series + ps = pd.Series(data, dtype=dtype) + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_quarter_start + got = gs.dt.is_quarter_start + + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + "2020-05-01", + "2020-05-31", + "2020-02-29", + None, + "1999-12-01", + "2000-12-21", + None, + "1900-02-28", + "1800-03-14", + "2100-03-10", + "1970-04-1", + "1970-01-01", + 
"1969-12-11", + "2020-12-31", + ] + ], +) +@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) +def test_is_quarter_end(data, dtype): + # Series + ps = pd.Series(data, dtype=dtype) + gs = cudf.from_pandas(ps) + + expect = ps.dt.is_quarter_end + got = gs.dt.is_quarter_end + + assert_eq(expect, got) + + +def test_error_values(): + s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") + with pytest.raises( + NotImplementedError, + match="DateTime Arrays is not yet implemented in cudf", + ): + s.values diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index d2de44b0c8f..51f05e1b876 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -9,7 +9,7 @@ import cudf from cudf.core.column import Decimal32Column, Decimal64Column, NumericalColumn -from cudf.core.dtypes import Decimal64Dtype +from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype from cudf.testing._utils import ( FLOAT_TYPES, INTEGER_TYPES, @@ -24,7 +24,7 @@ [1], [-1], [1, 2, 3, 4], - [42, 1729, 4104], + [42, 17, 41], [1, 2, None, 4], [None, None, None], [], @@ -164,21 +164,43 @@ def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype): ) @pytest.mark.parametrize( "from_dtype", - [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 10)], + [ + Decimal64Dtype(7, 2), + Decimal64Dtype(11, 4), + Decimal64Dtype(18, 10), + Decimal32Dtype(7, 2), + Decimal32Dtype(5, 3), + Decimal32Dtype(9, 5), + ], ) @pytest.mark.parametrize( "to_dtype", - [Decimal64Dtype(7, 2), Decimal64Dtype(18, 10), Decimal64Dtype(11, 4)], + [ + Decimal64Dtype(7, 2), + Decimal64Dtype(18, 10), + Decimal64Dtype(11, 4), + Decimal32Dtype(7, 2), + Decimal32Dtype(9, 5), + Decimal32Dtype(5, 3), + ], ) def test_typecast_to_from_decimal(data, from_dtype, to_dtype): - got = data.astype(from_dtype) + if from_dtype.scale > to_dtype.MAX_PRECISION: + pytest.skip( + "This is supposed to overflow because the representation value in " + "the source exceeds the max representable in destination dtype." 
+ ) + s = data.astype(from_dtype) - pa_arr = got.to_arrow().cast( + pa_arr = s.to_arrow().cast( pa.decimal128(to_dtype.precision, to_dtype.scale), safe=False ) - expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) + if isinstance(to_dtype, Decimal32Dtype): + expected = cudf.Series(Decimal32Column.from_arrow(pa_arr)) + elif isinstance(to_dtype, Decimal64Dtype): + expected = cudf.Series(Decimal64Column.from_arrow(pa_arr)) - got = got.astype(to_dtype) + got = s.astype(to_dtype) assert_eq(got, expected) @@ -347,3 +369,11 @@ def test_serialize_decimal_columns(data): df = cudf.DataFrame(data) recreated = df.__class__.deserialize(*df.serialize()) assert_eq(recreated, df) + + +def test_decimal_invalid_precision(): + with pytest.raises(pa.ArrowInvalid): + _ = cudf.Series([10, 20, 30], dtype=cudf.Decimal64Dtype(2, 2)) + + with pytest.raises(pa.ArrowInvalid): + _ = cudf.Series([Decimal("300")], dtype=cudf.Decimal64Dtype(2, 1)) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 41d7f5d215e..ee6cc7b6df6 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -257,3 +257,62 @@ def test_lists_of_structs_dtype(data): assert_column_array_dtype_equal(got._column, expected) assert expected.equals(got._column.to_arrow()) + + +@pytest.mark.parametrize( + "in_dtype,expect", + [ + (np.dtype("int8"), np.dtype("int8")), + (np.int8, np.dtype("int8")), + (np.float16, np.dtype("float32")), + (pd.Int8Dtype(), np.dtype("int8")), + (pd.StringDtype(), np.dtype("object")), + ("int8", np.dtype("int8")), + ("boolean", np.dtype("bool")), + ("bool_", np.dtype("bool")), + (np.bool_, np.dtype("bool")), + (int, np.dtype("int64")), + (float, np.dtype("float64")), + (cudf.ListDtype("int64"), cudf.ListDtype("int64")), + ("float16", np.dtype("float32")), + (np.dtype("U"), np.dtype("object")), + ("timedelta64", np.dtype("= min_size: + if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max: return int_dtype # resort to using `int64` and let numpy raise appropriate exception: @@ -350,7 +364,7 @@ def min_unsigned_type(x, min_size=8): that can represent the integer ``x`` """ for int_dtype in np.sctypes["uint"]: - if (np.dtype(int_dtype).itemsize * 8) >= min_size: + if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if 0 <= x <= np.iinfo(int_dtype).max: return int_dtype # resort to using `uint64` and let numpy raise appropriate exception: @@ -374,47 +388,22 @@ def min_column_type(x, expected_type): max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) - if result_type == np.dtype("float16"): - # cuDF does not support float16 dtype - result_type = np.dtype("float32") - return result_type - if np.issubdtype(expected_type, np.integer): + elif np.issubdtype(expected_type, np.integer): max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) - return np.promote_types(max_bound_dtype, min_bound_dtype) + result_type = np.promote_types(max_bound_dtype, min_bound_dtype) + else: + result_type = x.dtype - return x.dtype + return cudf.dtype(result_type) def get_min_float_dtype(col): max_bound_dtype = np.min_scalar_type(float(col.max())) min_bound_dtype = np.min_scalar_type(float(col.min())) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) - if result_type == np.dtype("float16"): - # cuDF does not support float16 dtype - result_type = 
np.dtype("float32") - return result_type - - -def check_cast_unsupported_dtype(dtype): - if is_categorical_dtype(dtype): - return dtype - - if isinstance(dtype, pd.core.arrays.numpy_.PandasDtype): - dtype = dtype.numpy_dtype - else: - dtype = np.dtype(dtype) - - if dtype in cudf._lib.types.np_to_cudf_types: - return dtype - - if dtype == np.dtype("float16"): - return np.dtype("float32") - - raise NotImplementedError( - f"Cannot cast {dtype} dtype, as it is not supported by CuDF." - ) + return cudf.dtype(result_type) def is_mixed_with_object_dtype(lhs, rhs): @@ -438,7 +427,7 @@ def get_time_unit(obj): def _get_nan_for_dtype(dtype): - dtype = np.dtype(dtype) + dtype = cudf.dtype(dtype) if pd.api.types.is_datetime64_dtype( dtype ) or pd.api.types.is_timedelta64_dtype(dtype): @@ -536,7 +525,7 @@ def find_common_type(dtypes): [dtype for dtype in dtypes if is_decimal_dtype(dtype)] ) else: - return np.dtype("O") + return cudf.dtype("O") # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately @@ -553,11 +542,7 @@ def find_common_type(dtypes): dtypes.add(np.result_type(*td_dtypes)) common_dtype = np.find_common_type(list(dtypes), []) - if common_dtype == np.dtype("float16"): - # cuDF does not support float16 dtype - return np.dtype("float32") - else: - return common_dtype + return cudf.dtype(common_dtype) def _can_cast(from_dtype, to_dtype): @@ -567,10 +552,12 @@ def _can_cast(from_dtype, to_dtype): `np.can_cast` but with some special handling around cudf specific dtypes. """ + if from_dtype in {None, cudf.NA}: + return True if isinstance(from_dtype, type): - from_dtype = np.dtype(from_dtype) + from_dtype = cudf.dtype(from_dtype) if isinstance(to_dtype, type): - to_dtype = np.dtype(to_dtype) + to_dtype = cudf.dtype(to_dtype) # TODO : Add precision & scale checking for # decimal types in future diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index 4bd19720151..77963f8bcc1 100644 --- a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -139,6 +139,15 @@ def _try_get_old_or_new_symbols(): # CUDA Driver Version Check: # Driver Runtime version is >= Runtime version pass + elif ( + cuda_driver_supported_rt_version >= 11000 + and cuda_runtime_version >= 11000 + ): + # With cuda enhanced compatibitlity any code compiled + # with 11.x version of cuda can now run on any + # driver >= 450.80.02. 11000 is the minimum cuda + # version 450.80.02 supports. + pass else: from cudf.errors import UnSupportedCUDAError diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 2aaea8435e0..1927ef96e6f 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -67,8 +67,8 @@ See Also -------- -cudf.io.csv.read_csv -cudf.io.json.read_json +cudf.read_csv +cudf.read_json """.format( remote_data_sources=_docstring_remote_sources ) @@ -175,7 +175,7 @@ -------- cudf.io.parquet.read_parquet_metadata cudf.io.parquet.to_parquet -cudf.io.orc.read_orc +cudf.read_orc """.format( remote_data_sources=_docstring_remote_sources ) @@ -217,7 +217,7 @@ See Also -------- cudf.io.parquet.read_parquet -cudf.io.orc.read_orc +cudf.read_orc """ doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet) @@ -256,6 +256,12 @@ Number of stripes List of column names +Notes +----- +Support for reading files with struct columns is currently experimental, +the output may not be as reliable as reading for other datatypes. 
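A small sketch of the dtype canonicalization that the find_common_type and get_min_float_dtype simplifications above lean on; the float16-to-float32 mapping is assumed from the dtype alias tests earlier in this patch.

    import numpy as np
    import cudf
    from cudf.utils.dtypes import find_common_type

    # cudf.dtype maps the unsupported float16 onto float32 ...
    assert cudf.dtype("float16") == np.dtype("float32")
    # ... so the common-type helpers no longer need an explicit special case.
    assert find_common_type([np.dtype("float16"), np.dtype("int8")]) == np.dtype("float32")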
+{remote_data_sources} + Examples -------- >>> import cudf @@ -270,7 +276,7 @@ See Also -------- -cudf.io.orc.read_orc +cudf.read_orc """ doc_read_orc_metadata = docfmt_partial(docstring=_docstring_read_orc_metadata) @@ -296,7 +302,7 @@ See Also -------- -cudf.io.orc.read_orc +cudf.read_orc """ doc_read_orc_statistics = docfmt_partial( docstring=_docstring_read_orc_statistics @@ -385,7 +391,7 @@ See Also -------- -cudf.io.orc.read_orc +cudf.read_orc """ doc_to_orc = docfmt_partial(docstring=_docstring_to_orc) @@ -687,7 +693,7 @@ See Also -------- -cudf.io.hdf.read_hdf : Read from HDF file. +cudf.read_hdf : Read from HDF file. cudf.io.parquet.to_parquet : Write a DataFrame to the binary parquet format. cudf.io.feather.to_feather : Write out feather-format for DataFrames. """ @@ -898,7 +904,7 @@ See Also -------- -cudf.io.csv.to_csv +cudf.to_csv """.format( remote_data_sources=_docstring_remote_sources ) @@ -963,7 +969,7 @@ See Also -------- -cudf.io.csv.read_csv +cudf.read_csv """ doc_to_csv = docfmt_partial( docstring=_docstring_to_csv.format( diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 209f61ad399..c9d38c8399e 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -17,7 +17,7 @@ from cudf.utils.dtypes import to_cudf_compatible_scalar # The size of the mask in bytes -mask_dtype = np.dtype(np.int32) +mask_dtype = cudf.dtype(np.int32) mask_bitsize = mask_dtype.itemsize * 8 @@ -42,10 +42,7 @@ def scalar_broadcast_to(scalar, size, dtype=None): if isinstance(size, (tuple, list)): size = size[0] - if scalar is None or ( - isinstance(scalar, (np.datetime64, np.timedelta64)) - and np.isnat(scalar) - ): + if cudf._lib.scalar._is_null_host_scalar(scalar): if dtype is None: dtype = "object" return column.column_empty(size, dtype=dtype, masked=True) @@ -70,7 +67,7 @@ def scalar_broadcast_to(scalar, size, dtype=None): scalar = to_cudf_compatible_scalar(scalar, dtype=dtype) dtype = scalar.dtype - if np.dtype(dtype).kind in ("O", "U"): + if cudf.dtype(dtype).kind in ("O", "U"): gather_map = column.full(size, 0, dtype="int32") scalar_str_col = column.as_column([scalar], dtype="str") return scalar_str_col[gather_map] diff --git a/python/cudf/requirements/cuda-11.0/dev_requirements.txt b/python/cudf/requirements/cuda-11.0/dev_requirements.txt index efb22ddd5a4..f69c246832b 100644 --- a/python/cudf/requirements/cuda-11.0/dev_requirements.txt +++ b/python/cudf/requirements/cuda-11.0/dev_requirements.txt @@ -23,6 +23,7 @@ packaging pandas>=1.0,<1.3.0dev0 pandoc==2.0a4 protobuf +pydata-sphinx-theme pyorc pytest pytest-benchmark @@ -33,7 +34,6 @@ setuptools sphinx sphinx-copybutton sphinx-markdown-tables -sphinx_rtd_theme sphinxcontrib-websupport transformers typing_extensions diff --git a/python/cudf/requirements/cuda-11.2/dev_requirements.txt b/python/cudf/requirements/cuda-11.2/dev_requirements.txt index cb88f74399f..e55dc2f921a 100644 --- a/python/cudf/requirements/cuda-11.2/dev_requirements.txt +++ b/python/cudf/requirements/cuda-11.2/dev_requirements.txt @@ -23,6 +23,7 @@ packaging pandas>=1.0,<1.3.0dev0 pandoc==2.0a4 protobuf +pydata-sphinx-theme pyorc pytest pytest-benchmark @@ -33,7 +34,6 @@ setuptools sphinx sphinx-copybutton sphinx-markdown-tables -sphinx_rtd_theme sphinxcontrib-websupport transformers typing_extensions diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 53543b9e886..c0204190957 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ 
b/python/dask_cudf/dask_cudf/backends.py @@ -1,5 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. +from collections.abc import Iterator + import cupy as cp import numpy as np import pandas as pd @@ -51,8 +53,8 @@ def _nonempty_index(idx): data = np.array([start, "1970-01-02"], dtype=idx.dtype) values = cudf.core.column.as_column(data) return cudf.core.index.DatetimeIndex(values, name=idx.name) - elif isinstance(idx, cudf.core.index.StringIndex): - return cudf.core.index.StringIndex(["cat", "dog"], name=idx.name) + elif isinstance(idx, cudf.StringIndex): + return cudf.StringIndex(["cat", "dog"], name=idx.name) elif isinstance(idx, cudf.core.index.CategoricalIndex): key = tuple(idx._data.keys()) assert len(key) == 1 @@ -67,10 +69,10 @@ def _nonempty_index(idx): return cudf.core.index.GenericIndex( np.arange(2, dtype=idx.dtype), name=idx.name ) - elif isinstance(idx, cudf.core.MultiIndex): + elif isinstance(idx, cudf.core.multiindex.MultiIndex): levels = [meta_nonempty(lev) for lev in idx.levels] codes = [[0, 0] for i in idx.levels] - return cudf.core.MultiIndex( + return cudf.core.multiindex.MultiIndex( levels=levels, codes=codes, names=idx.names ) @@ -256,6 +258,52 @@ def is_categorical_dtype_cudf(obj): return cudf.utils.dtypes.is_categorical_dtype(obj) +try: + from dask.dataframe.dispatch import percentile_dispatch + + @percentile_dispatch.register((cudf.Series, cp.ndarray, cudf.Index)) + def percentile_cudf(a, q, interpolation="linear"): + # Cudf dispatch to the equivalent of `np.percentile`: + # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html + a = cudf.Series(a) + # a is series. + n = len(a) + if not len(a): + return None, n + if isinstance(q, Iterator): + q = list(q) + + if cudf.utils.dtypes.is_categorical_dtype(a.dtype): + result = cp.percentile(a.cat.codes, q, interpolation=interpolation) + + return ( + pd.Categorical.from_codes( + result, a.dtype.categories, a.dtype.ordered + ), + n, + ) + if np.issubdtype(a.dtype, np.datetime64): + result = a.quantile( + [i / 100.0 for i in q], interpolation=interpolation + ) + + if q[0] == 0: + # https://github.com/dask/dask/issues/6864 + result[0] = min(result[0], a.min()) + return result.to_pandas(), n + if not np.issubdtype(a.dtype, np.number): + interpolation = "nearest" + return ( + a.quantile( + [i / 100.0 for i in q], interpolation=interpolation + ).to_pandas(), + n, + ) + + +except ImportError: + pass + try: from dask.dataframe.dispatch import union_categoricals_dispatch diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 2ec457018d9..600d6cc7412 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -16,6 +16,8 @@ from dask.dataframe.groupby import DataFrameGroupBy, SeriesGroupBy from dask.highlevelgraph import HighLevelGraph +import cudf + class CudfDataFrameGroupBy(DataFrameGroupBy): def __init__(self, *args, **kwargs): @@ -71,15 +73,28 @@ def aggregate(self, arg, split_every=None, split_out=1): "min", "max", "collect", + "first", + "last", } if ( isinstance(self.obj, DaskDataFrame) - and isinstance(self.index, (str, list)) + and ( + isinstance(self.index, str) + or ( + isinstance(self.index, list) + and all(isinstance(x, str) for x in self.index) + ) + ) and _is_supported(arg, _supported) ): + if isinstance(self._meta.grouping.keys, cudf.MultiIndex): + keys = self._meta.grouping.keys.names + else: + keys = self._meta.grouping.keys.name + return groupby_agg( self.obj, - self.index, + keys, arg, 
split_every=split_every, split_out=split_out, @@ -127,7 +142,10 @@ def aggregate(self, arg, split_every=None, split_out=1): "min", "max", "collect", + "first", + "last", } + if ( isinstance(self.obj, DaskDataFrame) and isinstance(self.index, (str, list)) @@ -165,7 +183,16 @@ def groupby_agg( This aggregation algorithm only supports the following options: - {"count", "mean", "std", "var", "sum", "min", "max", "collect"} + - "count" + - "mean" + - "std" + - "var" + - "sum" + - "min" + - "max" + - "collect" + - "first" + - "last" This "optimized" approach is more performant than the algorithm in `dask.dataframe`, because it allows the cudf backend to @@ -208,6 +235,8 @@ def groupby_agg( "min", "max", "collect", + "first", + "last", } if not _is_supported(aggs, _supported): raise ValueError( diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 510b5730169..0ac0af2842b 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -7,7 +7,7 @@ from pyarrow import parquet as pq from dask import dataframe as dd -from dask.dataframe.io.parquet.arrow import ArrowEngine +from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine try: from dask.dataframe.io.parquet import ( @@ -19,12 +19,20 @@ import cudf from cudf.core.column import as_column, build_categorical_column from cudf.io import write_to_dataset +from cudf.utils.dtypes import cudf_dtype_from_pa_type -class CudfEngine(ArrowEngine): +class CudfEngine(ArrowDatasetEngine): @staticmethod def read_metadata(*args, **kwargs): - meta, stats, parts, index = ArrowEngine.read_metadata(*args, **kwargs) + meta, stats, parts, index = ArrowDatasetEngine.read_metadata( + *args, **kwargs + ) + if parts: + # Re-set "object" dtypes align with pa schema + set_object_dtypes_from_pa_schema( + meta, parts[0].get("common_kwargs", {}).get("schema", None), + ) # If `strings_to_categorical==True`, convert objects to int32 strings_to_cats = kwargs.get("strings_to_categorical", False) @@ -59,7 +67,6 @@ def read_partition( pieces = [pieces] strings_to_cats = kwargs.get("strings_to_categorical", False) - if len(pieces) > 1: paths = [] @@ -72,6 +79,9 @@ def read_partition( rgs.append(None) else: (path, row_group, partition_keys) = piece + + row_group = None if row_group == [None] else row_group + paths.append(path) rgs.append( [row_group] @@ -96,6 +106,7 @@ def read_partition( partition_keys = [] else: (path, row_group, partition_keys) = pieces[0] + row_group = None if row_group == [None] else row_group if cudf.utils.ioutils._is_local_filesystem(fs): df = cudf.read_parquet( @@ -117,6 +128,9 @@ def read_partition( **kwargs.get("read", {}), ) + # Re-set "object" dtypes align with pa schema + set_object_dtypes_from_pa_schema(df, kwargs.get("schema", None)) + if index and (index[0] in df.columns): df = df.set_index(index[0]) elif index is False and set(df.index.names).issubset(columns): @@ -127,17 +141,22 @@ def read_partition( if partition_keys: if partitions is None: raise ValueError("Must pass partition sets") + for i, (name, index2) in enumerate(partition_keys): - categories = [ - val.as_py() for val in partitions.levels[i].dictionary - ] - col = as_column(index2).as_frame().repeat(len(df))._data[None] + # Build the column from `codes` directly + # (since the category is often a larger dtype) + codes = ( + as_column(partitions[i].keys.index(index2)) + .as_frame() + .repeat(len(df)) + ._data[None] + ) df[name] = build_categorical_column( - categories=categories, - 
codes=as_column(col.base_data, dtype=col.dtype), - size=col.size, - offset=col.offset, + categories=partitions[i].keys, + codes=codes, + size=codes.size, + offset=codes.offset, ordered=False, ) @@ -233,6 +252,18 @@ def aggregate_metadata(cls, meta_list, fs, out_path): return meta +def set_object_dtypes_from_pa_schema(df, schema): + # Simple utility to modify cudf DataFrame + # "object" dtypes to agree with a specific + # pyarrow schema. + if schema: + for name in df.columns: + if name in schema.names and df[name].dtype == "O": + df[name] = df[name].astype( + cudf_dtype_from_pa_type(schema.field(name).type) + ) + + def read_parquet( path, columns=None, @@ -243,9 +274,9 @@ def read_parquet( """ Read parquet files into a Dask DataFrame Calls ``dask.dataframe.read_parquet`` to cordinate the execution of - ``cudf.read_parquet``, and ultimately read multiple partitions into a - single Dask dataframe. The Dask version must supply an ``ArrowEngine`` - class to support full functionality. + ``cudf.read_parquet``, and ultimately read multiple partitions into + a single Dask dataframe. The Dask version must supply an + ``ArrowDatasetEngine`` class to support full functionality. See ``cudf.read_parquet`` and Dask documentation for further details. Examples diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 740a2d48ce2..a5492bc5fc0 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -455,17 +455,24 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): p1 = os.path.join(tmpdir, "part.1.parquet") df1.to_parquet(p1, engine="pyarrow") - with pytest.raises(RuntimeError): - # Pyarrow will fail to aggregate metadata - # if gather_statistics=True - dask_cudf.read_parquet(str(tmpdir), gather_statistics=True,).compute() + # New pyarrow-dataset base can handle an inconsistent + # schema (even without a _metadata file), but computing + # and dtype validation may fail + ddf1 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True) # Add global metadata file. # Dask-CuDF can do this without requiring schema - # consistency. Once the _metadata file is avaible, - # parsing metadata should no longer be a problem + # consistency. dask_cudf.io.parquet.create_metadata_file([p0, p1]) - # Check that we can now read the ddf + # Check that we can still read the ddf # with the _metadata file present - dask_cudf.read_parquet(str(tmpdir), gather_statistics=True,).compute() + ddf2 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True) + + # Check that the result is the same with and + # without the _metadata file. 
Note that we must + # call `compute` on `ddf1`, because the dtype of + # the inconsistent column ("a") may be "object" + # before computing, and "int" after + dd.assert_eq(ddf1.compute(), ddf2) + dd.assert_eq(ddf1.compute(), ddf2.compute()) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index cf5203a22e5..ace9701b677 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -59,7 +59,7 @@ def test_from_cudf_with_generic_idx(): ddf = dgd.from_cudf(cdf, npartitions=2) - assert isinstance(ddf.index.compute(), cudf.core.index.GenericIndex) + assert isinstance(ddf.index.compute(), cudf.RangeIndex) dd.assert_eq(ddf.loc[1:2, ["a"]], cdf.loc[1:2, ["a"]]) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 84de32952e5..61fa32b76ed 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -594,3 +594,54 @@ def test_groupby_unique_lists(): dd.assert_eq( gdf.groupby("a").b.unique(), gddf.groupby("a").b.unique().compute(), ) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [], "b": []}, + {"a": [2, 1, 2, 1, 1, 3], "b": [None, 1, 2, None, 2, None]}, + {"a": [None], "b": [None]}, + {"a": [2, 1, 1], "b": [None, 1, 0], "c": [None, 0, 1]}, + ], +) +@pytest.mark.parametrize("agg", ["first", "last"]) +def test_groupby_first_last(data, agg): + pdf = pd.DataFrame(data) + gdf = cudf.DataFrame.from_pandas(pdf) + + ddf = dd.from_pandas(pdf, npartitions=2) + gddf = dask_cudf.from_cudf(gdf, npartitions=2) + + dd.assert_eq( + ddf.groupby("a").agg(agg).compute(), + gddf.groupby("a").agg(agg).compute(), + ) + + dd.assert_eq( + getattr(ddf.groupby("a"), agg)().compute(), + getattr(gddf.groupby("a"), agg)().compute(), + ) + + dd.assert_eq( + gdf.groupby("a").agg(agg), gddf.groupby("a").agg(agg).compute() + ) + + dd.assert_eq( + getattr(gdf.groupby("a"), agg)(), + getattr(gddf.groupby("a"), agg)().compute(), + ) + + +def test_groupby_with_list_of_series(): + df = cudf.DataFrame({"a": [1, 2, 3, 4, 5]}) + gdf = dask_cudf.from_cudf(df, npartitions=2) + gs = cudf.Series([1, 1, 1, 2, 2], name="id") + ggs = dask_cudf.from_cudf(gs, npartitions=2) + + ddf = dd.from_pandas(df.to_pandas(), npartitions=2) + pgs = dd.from_pandas(gs.to_pandas(), npartitions=2) + + dd.assert_eq( + gdf.groupby([ggs]).agg(["sum"]), ddf.groupby([pgs]).agg(["sum"]) + )