From 1160ef198578e48f8fcced2fbedb661658781759 Mon Sep 17 00:00:00 2001
From: Thomas Li
Date: Wed, 31 Jul 2024 17:34:03 +0000
Subject: [PATCH] Squashed commit of the following:

commit 606d15e7260b553cbdb69f9ecd935c12ba94e430
Author: Thomas Li
Date: Wed Jul 31 14:30:48 2024 +0000

put back mistakenly removed CMakeLists.txt

commit feac68de39be09c1751d0ccc2bb5f93b1075ac8f
Author: Thomas Li
Date: Wed Jul 31 13:59:50 2024 +0000

rpath was the problem?

commit b2b68e14b9faa1dac0f2516667f65ecb5693a744
Author: Thomas Li
Date: Tue Jul 30 22:29:14 2024 +0000

maybe fix?

commit 5243eac8a90114e4fdf794760cb6b6029d9ba1a1
Author: Thomas Li
Date: Tue Jul 30 21:11:03 2024 +0000

fix cuda suffixing

commit acb31227d3ffb07e4a35be5d1c0ec6cbadbfe53d
Author: Thomas Li
Date: Tue Jul 30 20:29:52 2024 +0000

fixes

commit b2306df549ac5db08dc0d1b09df270137dacfe9d
Author: Thomas Li
Date: Tue Jul 30 20:08:13 2024 +0000

fixes

commit d6d91df1510a70d79fefacf8b57ca1caf027edf8
Merge: b7a2782f1a 7b3e73a7e3
Author: Thomas Li
Date: Tue Jul 30 19:32:18 2024 +0000

Merge branch 'branch-24.10' of github.com:rapidsai/cudf into setup-pylibcudf-package

commit 7b3e73a7e38b671db1387879cfa963fe61060c36
Merge: ce259fff66 dbf4bd02a8
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Tue Jul 30 13:14:19 2024 -0400

Merge pull request #16435 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit dbf4bd02a8fdccd1891edbc2d049c3ddddb234b3
Author: GALI PREM SAGAR
Date: Tue Jul 30 12:14:14 2024 -0500

Add docs about rmm modes in `cudf.pandas` (#16404)

This PR adds user-facing docs for rmm memory modes and prefetching.

---------

Co-authored-by: Mark Harris <783069+harrism@users.noreply.github.com>
Co-authored-by: Bradley Dice

commit ce259fff6641dd847883d535645c7c17c36fb7ec
Merge: b8bfe2c912 0f07b0bb5e
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Tue Jul 30 09:02:26 2024 -0400

Merge pull request #16433 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 0f07b0bb5e2cc89ca66e9d9639ff6ac961ec0471
Author: GALI PREM SAGAR
Date: Tue Jul 30 08:02:21 2024 -0500

Enable prefetching before `runpy` (#16427)

This PR enables prefetching before we execute the `runpy` module and script code.

commit b8bfe2c91234032cbe9b2549e46a08109e238c8a
Merge: d1be0b6dc0 5feeaf3827
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Tue Jul 30 09:02:06 2024 -0400

Merge pull request #16432 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 5feeaf3827bfd20755cdd0516ef0c6ba484a600c
Author: Richard (Rick) Zamora
Date: Tue Jul 30 08:02:01 2024 -0500

[Bug] Remove loud `NativeFile` deprecation noise for `read_parquet` from S3 (#16415)

Important follow-up to https://github.com/rapidsai/cudf/pull/16132.

Without this PR, using `dask_cudf.read_parquet("s3://...", ...)` will result in loud deprecation warnings after `compute`/`persist` is called. This is because dask will always pass `NativeFile` objects down to cudf. My fault for missing this earlier!
commit d1be0b6dc06fddd0b69fb69731281b16894cb132
Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon Jul 29 15:12:38 2024 -1000

Align CategoricalIndex APIs with pandas 2.x (#16369)

Mostly exposing methods that were available on the CategoricalColumn.

Authors:
- Matthew Roeschke (https://github.com/mroeschke)
- GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16369

commit 368a34ca9fd7db1b6cfb6e7817978e3e4fcfb00b
Author: Bradley Dice
Date: Mon Jul 29 20:05:17 2024 -0500

Use RMM adaptor constructors instead of factories. (#16414)

This PR uses RMM memory resource adaptor constructors instead of factory functions. With CTAD, we do not need the factory and can use the constructor directly. The factory will be deprecated in https://github.com/rapidsai/rmm/pull/1626.

Authors:
- Bradley Dice (https://github.com/bdice)

Approvers:
- Nghia Truong (https://github.com/ttnghia)
- Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

URL: https://github.com/rapidsai/cudf/pull/16414
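A minimal sketch of the CTAD pattern described in #16414, using RMM's `statistics_resource_adaptor` as a representative adaptor (illustrative only, not code from that PR):

```cpp
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/statistics_resource_adaptor.hpp>

int main()
{
  rmm::mr::cuda_memory_resource upstream;

  // Factory style, slated for deprecation in rmm#1626:
  // auto stats_mr = rmm::mr::make_statistics_adaptor(&upstream);

  // Constructor style: CTAD deduces
  // statistics_resource_adaptor<cuda_memory_resource> from the argument.
  rmm::mr::statistics_resource_adaptor stats_mr{&upstream};
  return 0;
}
```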
commit e8048f7f3d66433203651a6a603d4de1360ca5ca
Merge: f8eb63e499 bd302d773c
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Mon Jul 29 20:07:38 2024 -0400

Merge pull request #16431 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit bd302d773c50552531bc7f11f782f8ed876e8fab
Author: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Mon Jul 29 17:07:33 2024 -0700

Make `prefetch_config::get` and `prefetch_config::set` thread-safe (#16425)

This adds multi-thread support for the `prefetch_config` getter and setter functions, avoiding the issue of the config map being corrupted in multi-threaded environments.

Closes https://github.com/rapidsai/cudf/issues/16426.

---------

Signed-off-by: Nghia Truong
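The guarded-config pattern #16425 describes, as a standalone sketch (the class and member names are illustrative; the actual libcudf implementation may differ): a mutex serializes access to the config map in both the getter and the setter.

```cpp
#include <map>
#include <mutex>
#include <string>

class prefetch_config_sketch {
 public:
  bool get(std::string const& key) const
  {
    std::lock_guard<std::mutex> lock(config_mutex_);  // serialize readers and writers
    auto const it = config_values_.find(key);
    return it != config_values_.end() && it->second;
  }

  void set(std::string const& key, bool value)
  {
    std::lock_guard<std::mutex> lock(config_mutex_);  // no concurrent map mutation
    config_values_[key] = value;
  }

 private:
  mutable std::mutex config_mutex_;
  std::map<std::string, bool> config_values_;
};
```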
commit f8eb63e499f94d583d715f5c1f5e6f234589be57
Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon Jul 29 12:39:19 2024 -1000

Align Index APIs with pandas 2.x (#16361)

Similar to https://github.com/rapidsai/cudf/pull/16310, the following APIs have been modified to adjust/add parameters:

* `to_flat_index`
* `isin`
* `unique`
* `transpose`

Authors:
- Matthew Roeschke (https://github.com/mroeschke)
- GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16361

commit 743e16426c564d0ed0d7e3d9be5f67e4605c4f32
Author: James Lamb
Date: Mon Jul 29 14:19:43 2024 -0500

update some branch references in GitHub Actions configs (#16397)

Fixes some lingering references to `branch-24.08` in the `pr_issue_status_automation` CI workflow. This was missed when new branches were cut because that file ends in `.yml` and `update-version.sh` was only modifying files ending in `.yaml`. The corresponding `update-version.sh` changes were made in #16183 and are already on 24.10 thanks to forward merges.

https://github.com/rapidsai/cudf/blob/dc05a01f3fc0742c5fbbddd86a0f2007bfdc2050/ci/release/update-version.sh#L78

## Notes for Reviewers

I checked like this, and don't see any other missed references:

```shell
git grep -E '24\.8|24\.08|0\.39'
```

Authors:
- James Lamb (https://github.com/jameslamb)

Approvers:
- Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/cudf/pull/16397

commit 35796057b64e258713d4d89ba368837d30a1a9c5
Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon Jul 29 08:33:23 2024 -1000

Align misc DataFrame and MultiIndex methods with pandas 2.x (#16402)

The API changes in this PR are mostly adding implementations or adding missing keyword arguments (although they might not be implemented). The APIs affected are:

* `DataFrame.insert`
* `DataFrame.melt`
* `DataFrame.merge`
* `DataFrame.quantile`
* `DataFrame.cov`
* `DataFrame.corr`
* `DataFrame.median`
* `DataFrame.rolling`
* `DataFrame.resample`
* `DataFrame.dropna`
* `MultiIndex.from_tuple`
* `MultiIndex.from_frame`
* `MultiIndex.from_product`

Authors:
- Matthew Roeschke (https://github.com/mroeschke)
- GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16402

commit 6e7624d6b31c93b0547590929ac63ed8e3a48d24
Author: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon Jul 29 14:06:51 2024 -0400

Add stream parameter to reshape APIs (#16410)

Adds a `stream` parameter to the reshape APIs:

- `cudf::interleave_columns`
- `cudf::tile`
- `cudf::byte_cast`

Found while working on #15983.

Authors:
- David Wendt (https://github.com/davidwendt)

Approvers:
- Bradley Dice (https://github.com/bdice)
- Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16410
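Illustrative call shape for the new overloads in #16410, assuming the `stream` argument follows libcudf's usual convention of sitting before the memory resource and defaulting to the default stream:

```cpp
#include <cudf/column/column.hpp>
#include <cudf/reshape.hpp>
#include <cudf/table/table_view.hpp>
#include <rmm/cuda_stream.hpp>

#include <memory>

std::unique_ptr<cudf::column> interleave_on_stream(cudf::table_view const& input)
{
  rmm::cuda_stream stream;  // caller-owned, non-default stream
  // Work is enqueued on `stream` rather than the default stream.
  return cudf::interleave_columns(input, stream.view());
}
```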
- [ ] The documentation is up to date with these changes.

commit 1cea1eaf6c1e87e65729897dd9bbedc4bdc5e7ab
Author: Kyle Edwards
Date: Thu Jul 25 16:26:34 2024 -0400

Don't export bs_thread_pool (#16398)

## Description

cudf does not currently export any headers that depend on bs_thread_pool, and having it as a dependency is currently causing problems for consumers. Avoid exporting it since it's not needed.

## Checklist

- [ ] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [ ] New or existing tests cover these changes.
- [ ] The documentation is up to date with these changes.

commit dc05a01f3fc0742c5fbbddd86a0f2007bfdc2050
Merge: fb2021fe82 e553295cfa
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Thu Jul 25 12:14:52 2024 -0400

Merge pull request #16396 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit e553295cfaf2f5bd1f539ee78d9a3a064e00e5f0
Author: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Thu Jul 25 11:14:47 2024 -0500

Require fixed width types for casting in `cudf-polars` (#16381)

Fixes a bug where numeric <-> string casts are not being properly rejected at the cudf-polars level.

Authors:
- https://github.com/brandon-b-miller

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16381

commit fb2021fe82724746ae1c58345ed37f7e7a0207ed
Merge: 673b96f6d1 f756e01a3c
Author: Ray Douglass <3107146+raydouglass@users.noreply.github.com>
Date: Thu Jul 25 11:06:30 2024 -0400

Merge pull request #16391 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit f756e01a3c5ff83421b1afb44460d9e5147a410e
Author: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu Jul 25 07:04:47 2024 -0700

Implement support for scan_ndjson in cudf-polars (#16263)

Implement support for scan_ndjson in cudf-polars.

Authors:
- Thomas Li (https://github.com/lithomas1)
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16263

commit 673b96f6d15dbd5d8bcb22d612d3c324aa899e26
Merge: 5a3399bec8 4cc37896a5
Author: Jake Awe <50372925+AyodeAwe@users.noreply.github.com>
Date: Thu Jul 25 08:27:15 2024 -0500

Merge pull request #16393 from jameslamb/branch-24.10-merge-branch-24.08

Merge branch-24.08 into branch-24.10

commit d953676e9281125a5b8bd9be739c997611471771
Author: Robert Maynard
Date: Thu Jul 25 04:49:12 2024 -0400

Hide visibility of non public symbols (#15982)

Converts cudf over to a system of explicit markup of what symbols should be used by consumers. This is done by compiling with `-fvisibility=hidden` and explicit markup via `CUDF_EXPORT` of the components we want usable.

Due to issues with tests, a portion of the `include/` detail functions had to be marked as public API. More concerning, the tests leverage functions from `cpp/` that are never part of the installed headers. That set of files can be found at https://github.com/rapidsai/cudf/commit/16b365635ab0f86bb1cc6db5f036564e8290f3b1 and we should discuss how to restructure cudf to remove these.

Authors:
- Robert Maynard (https://github.com/robertmaynard)
- Bradley Dice (https://github.com/bdice)

Approvers:
- Bradley Dice (https://github.com/bdice)
- Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15982
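The general mechanism behind `-fvisibility=hidden` plus export markup, as in #15982, sketched with a generic macro rather than cudf's actual `CUDF_EXPORT` definition:

```cpp
// Compile with: g++ -fPIC -fvisibility=hidden -shared mylib.cpp -o libmylib.so
#define MYLIB_EXPORT __attribute__((visibility("default")))

namespace mylib {

// Explicitly exported: part of the shared library's public ABI.
MYLIB_EXPORT int public_entry_point(int x) { return x + 1; }

// Hidden by -fvisibility=hidden: consumers cannot link against this.
int internal_helper(int x) { return x * 2; }

}  // namespace mylib
```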
commit 4aefcc7b2988346166b9a757fc837e93f6f0a3bb
Author: GALI PREM SAGAR
Date: Wed Jul 24 22:30:35 2024 -0500

Add ability to prefetch in `cudf.pandas` and change default to managed pool (#16296)

This PR adds the ability to prefetch in `cudf.pandas`, based on https://github.com/rapidsai/rmm/pull/1608/.

Authors:
- GALI PREM SAGAR (https://github.com/galipremsagar)
- Bradley Dice (https://github.com/bdice)

Approvers:
- Bradley Dice (https://github.com/bdice)
- Muhammad Haseeb (https://github.com/mhaseeb123)
- Vyas Ramasubramani (https://github.com/vyasr)
- Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/16296

commit 6486bb928dfb0e1817b0604572e2f5789d05c596
Author: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed Jul 24 22:24:46 2024 -0400

Migrate lists/filtering to pylibcudf (#16184)

Part of #15162.

Authors:
- Matthew Murray (https://github.com/Matt711)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16184

commit a33f520b370d048a22de031294311c241ab23858
Author: David Gardner <96306125+dagardner-nv@users.noreply.github.com>
Date: Wed Jul 24 18:42:16 2024 -0700

Fix inconsistent usage of 'results' and 'records' in read-json.md (#15766)

* Fix inconsistent usage of 'results' and 'records' in `docs/cudf/source/user_guide/io/read-json.md`

Authors:
- David Gardner (https://github.com/dagardner-nv)
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- Bradley Dice (https://github.com/bdice)
- Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15766

commit 5a3399bec868f44d13c003f172c665919096d8e8
Author: James Lamb
Date: Wed Jul 24 19:26:12 2024 -0500

fix [tool.setuptools] reference in custreamz config (#16365)

Noticed this warning in logs from #16183:

> _/python3.10/site-packages/setuptools/config/pyprojecttoml.py:70: _ToolsTypoInMetadata: Ignoring [tools.setuptools] in pyproject.toml, did you mean [tool.setuptools]?_

This fixes that.

## Notes for Reviewers

Intentionally targeting this at 24.10. This misconfiguration has been in `custreamz` since the 23.04 release ([git blame link](https://github.com/rapidsai/cudf/blame/e6d412cba7c23df7ee500c28257ed9281cea49b9/python/custreamz/pyproject.toml#L60)). I think the only effect might be that some test files are included in wheels when we don't want them to be. I don't think the fix for it needs to be rushed into 24.08.

I searched across RAPIDS in case this was copied from somewhere else... don't see any other instances of this typo that need to be fixed.

Authors:
- James Lamb (https://github.com/jameslamb)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16365

commit 4cc37896a5dff1e019f0dff8101f3a84a05fd5d8
Merge: 29ce5c529e a36dacb663
Author: James Lamb
Date: Wed Jul 24 18:54:56 2024 -0500

Merge branch-24.08 into branch-24.10

commit a36dacb66325e03d3264482d35a5cf7e0b6c7a37
Author: Lawrence Mitchell
Date: Thu Jul 25 00:31:40 2024 +0100

Make C++ compilation warning free after #16297 (#16379)

In https://github.com/rapidsai/cudf/pull/16297, we deprecated the use of `to_arrow` in favour of `to_arrow_host` and `to_arrow_device`. However, the scalar detail overload of `to_arrow` used the public table overload, so we get a warning when compiling internal libcudf code. Fix this by using the detail API, and fix a bug along the way where we were not passing through the arrow memory resource.

Authors:
- Lawrence Mitchell (https://github.com/wence-)

Approvers:
- David Wendt (https://github.com/davidwendt)
- Michael Schellenberger Costa (https://github.com/miscco)
- Vyas Ramasubramani (https://github.com/vyasr)
- Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16379

commit ae4c7e3ce4fe100eb919ca00fa34461e44078ba9
Author: James Lamb
Date: Wed Jul 24 18:30:53 2024 -0500

split up CUDA-suffixed dependencies in dependencies.yaml (#16183)

Contributes to https://github.com/rapidsai/build-planning/issues/31. Follow-up to #15245.

RAPIDS DLFW builds prefer to build all RAPIDS packages together without CUDA suffixes, leading to the following set of requirements for `cudf` wheels built there:

* project name must be `cudf` (not `cudf-cu12`)
* all dependencies must be unsuffixed (e.g. `rmm` not `rmm-cu12`)
* the correct set of dependencies based on CUDA version must be expressed in the wheel metadata (e.g. `cubinlinker` and `ptxcompiler` on CUDA 11, `pynvjitlink` on CUDA 12)

To meet all 3 of those, this proposes decomposing CUDA-suffixed dependencies in `dependencies.yaml` into two lists, `cuda_suffixed="true"` and `cuda_suffixed="false"`. That'd allow DLFW builds to do the following to meet those requirements:

```shell
pip wheel \
  -C rapidsai.disable-cuda=true \
  -C rapidsai.matrix-entry="cuda=12.5;cuda_suffixed=false" \
  .
```

Authors:
- James Lamb (https://github.com/jameslamb)

Approvers:
- Bradley Dice (https://github.com/bdice)
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16183

commit 29ce5c529ea9ea18edc32ab905f1ef076f266008
Author: Michael Schellenberger Costa
Date: Thu Jul 25 01:29:41 2024 +0200

Fix some issues with deprecated / removed cccl facilities (#16377)

`cub::If` has been deprecated and should not be used; there is a better alternative in `cuda::std::conditional_t`.

`thrust::{binary, unary}_function` has been deprecated and does not serve a purpose, similar to the removed `std::{binary, unary}_function`. Rather than relying on the type aliases, one should use the `std::invoke` machinery.

Authors:
- Michael Schellenberger Costa (https://github.com/miscco)

Approvers:
- Bradley Dice (https://github.com/bdice)
- Nghia Truong (https://github.com/ttnghia)
- Bernhard Manfred Gruber (https://github.com/bernhardmgruber)

URL: https://github.com/rapidsai/cudf/pull/16377
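A before/after sketch of the two migrations named in #16377, on an illustrative functor rather than actual cudf code:

```cpp
#include <cuda/std/type_traits>

// Before: cub::If<UseDouble, double, float>::Type (deprecated).
// After: the libcu++ alias performs the same compile-time type selection.
template <bool UseDouble>
using accum_t = cuda::std::conditional_t<UseDouble, double, float>;

// Before: struct square : thrust::unary_function<float, float> { ... };
// After: a plain functor; argument/result types are deduced where needed
// via the std::invoke machinery instead of member typedefs.
struct square {
  __host__ __device__ float operator()(float x) const { return x * x; }
};
```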
commit a6b1cf1fa96d622626a9e4d99a5c71d33fb1bd49
Merge: 2eabe0de58 59f65843b8
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Wed Jul 24 19:10:33 2024 -0400

Merge pull request #16389 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 59f65843b80d967f743841aee8489b6ae63b269a
Author: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed Jul 24 16:10:28 2024 -0700

Gracefully CUDF_FAIL when `skip_rows > 0` in Chunked Parquet reader (#16385)

This PR must merge in cudf 24.08 to avoid unhandled exceptions. Gracefully CUDF_FAIL in the chunked parquet reader when `skip_rows > 0`, which may otherwise result in runtime errors like segfaults or an infinite loop. See #16186 for more information.

Authors:
- Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
- David Wendt (https://github.com/davidwendt)
- Vyas Ramasubramani (https://github.com/vyasr)
- Bradley Dice (https://github.com/bdice)
- Karthikeyan (https://github.com/karthikeyann)
- Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16385
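The "fail gracefully" idiom referenced in #16385, assuming libcudf's `CUDF_EXPECTS` macro (which throws a `cudf::logic_error` carrying the given message when the condition is false); the function below is a hypothetical illustration, not the PR's code:

```cpp
#include <cudf/utilities/error.hpp>

void validate_chunked_reader_options(long skip_rows)
{
  // Throwing a well-formed exception is recoverable by the caller,
  // unlike the segfault or infinite loop the unchecked path could hit.
  CUDF_EXPECTS(skip_rows == 0,
               "skip_rows > 0 is not supported by the chunked Parquet reader");
}
```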
commit 2eabe0de584ff8c8ae6e82b1845309d5b01c4a98
Merge: 4624edf586 8bba6dfad2
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Wed Jul 24 18:16:08 2024 -0400

Merge pull request #16388 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 8bba6dfad239b4fd69a82acbc5dd7707ba576cce
Author: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed Jul 24 18:16:03 2024 -0400

Migrate lists/set_operations to pylibcudf (#16190)

Part of #15162.

Authors:
- Matthew Murray (https://github.com/Matt711)

Approvers:
- Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16190

commit 4624edf58683391529cd9d7b76ca2e45438655bf
Merge: 077457ee89 73937fbaba
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Wed Jul 24 16:42:06 2024 -0400

Merge pull request #16387 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 73937fbabaeea76665663ed23688b1cac61b7ee9
Author: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed Jul 24 16:42:00 2024 -0400

Migrate lists/filling to pylibcudf (#16189)

Part of #15162.

Authors:
- Matthew Murray (https://github.com/Matt711)
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- Thomas Li (https://github.com/lithomas1)
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16189

commit 077457ee89140e98c9e25849511b14410370f684
Merge: 17c1afbd93 8fcf72a787
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Wed Jul 24 13:06:35 2024 -0400

Merge pull request #16382 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 8fcf72a787acb0168c97d11b8ab9130146e9b37e
Author: Alessandro Bellina
Date: Wed Jul 24 12:06:29 2024 -0500
[JNI] Add setKernelPinnedCopyThreshold and setPinnedAllocationThreshold (#16288)

In 24.08 two new cuDF methods are being added, and the second method is still in flight (see https://github.com/rapidsai/cudf/pull/16206):

```
cudf::set_kernel_pinned_copy_threshold
cudf::set_allocate_host_as_pinned_threshold
```

We'd like to expose these methods in our JNI layer. I created a Cudf.java with the two static methods, and put the definitions in CudfJni.cpp.

Marked as draft since I need https://github.com/rapidsai/cudf/pull/16206 to merge, and we are still testing it.

Authors:
- Alessandro Bellina (https://github.com/abellina)
- Nghia Truong (https://github.com/ttnghia)

Approvers:
- Robert (Bobby) Evans (https://github.com/revans2)
- Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/16288

commit 17c1afbd936989bdcdcdb5654c1cbc4dbe57cc7d
Merge: a0c58c766e 7191b74ce2
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Wed Jul 24 09:55:53 2024 -0400

Merge pull request #16380 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 7191b74ce244518f17ef65e701f5a262f1c5cf8a
Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed Jul 24 03:55:48 2024 -1000

Align Index __init__ APIs with pandas 2.x (#16362)

* It would be nice for `Index`'s constructor not to go through `IndexMeta.__call__`, but I think that would be a separate effort
* There were a couple of `verify_integrity` keyword arguments added that don't raise a `NotImplementedError` even though there's no support, but I don't think it's worth making this case fall back in `cudf.pandas`, as it's just a validation and won't affect further behavior with the object

Authors:
- Matthew Roeschke (https://github.com/mroeschke)
- GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16362

commit a0c58c766e41525059e5a4e37ac5fce3a638468e
Merge: b66281c4fa 743264f6ac
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Wed Jul 24 06:32:36 2024 -0400

Merge pull request #16378 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 743264f6ac924fdbec58fad666f989b14b901a98
Author: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Wed Jul 24 05:32:31 2024 -0500

Warn on cuDF failure when `POLARS_VERBOSE` is true (#16308)

Just something quick to get us started here. Closes https://github.com/rapidsai/cudf/issues/16256

Authors:
- https://github.com/brandon-b-miller
- Lawrence Mitchell (https://github.com/wence-)

Approvers:
- Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16308

commit b66281c4fa811431dec0cdc0d8222fba9e8e4088
Merge: f20205b2dc 62625f1bfc
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Wed Jul 24 03:42:08 2024 -0400

Merge pull request #16376 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 62625f1bfcdb980186a1afbec41e420fdb4a7075
Author: Matt Topol
Date: Wed Jul 24 03:42:03 2024 -0400

Host implementation of `to_arrow` using nanoarrow (#16297)

Adds the corresponding `to_arrow_host` functions for interop using `ArrowDeviceArray`. This includes updating the version of nanoarrow in use to pick up some bug fixes and features.
Authors:
- Matt Topol (https://github.com/zeroshade)
- Muhammad Haseeb (https://github.com/mhaseeb123)
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- Muhammad Haseeb (https://github.com/mhaseeb123)
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16297

commit f20205b2dc7a5e830b72386df378934c53da5043
Merge: bc748d67b5 8c1749b40e
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Wed Jul 24 01:19:15 2024 -0400

Merge pull request #16375 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 8c1749b40eaa983966ed3bece6bdd29a4316d18a
Author: Kyle Edwards
Date: Wed Jul 24 01:19:10 2024 -0400

Use rapids_cpm_bs_thread_pool() (#16360)

Authors:
- Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
- Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16360

commit bc748d67b52de4cf1c876f9701644fdbf1d839e5
Merge: 6d9aff4b7d 75289c58f3
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Wed Jul 24 00:46:03 2024 -0400

Merge pull request #16374 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 75289c58f3d9ca11a51396e4adadfbd5f51856f5
Author: Bradley Dice
Date: Tue Jul 23 23:45:59 2024 -0500

Rename PrefetchConfig to prefetch_config. (#16358)

This PR addresses a comment requesting a rename of `PrefetchConfig` to `prefetch_config`. See https://github.com/rapidsai/cudf/pull/16020#discussion_r1686284151.

Authors:
- Bradley Dice (https://github.com/bdice)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)
- Shruti Shivakumar (https://github.com/shrshi)
- Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16358

commit 6d9aff4b7dfd23db43d294dacdeaf6c52af2fc4b
Merge: dcf791c83e f0efc8b36a
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Tue Jul 23 20:17:10 2024 -0400

Merge pull request #16373 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit f0efc8b36a8f43cfa027966265dcea052bb5c45d
Author: Vukasin Milovanovic
Date: Tue Jul 23 17:17:05 2024 -0700

Modify `make_host_vector` and `make_device_uvector` factories to optionally use pinned memory and kernel copy (#16206)

Issue #15616

Modified `make_host_vector` functions to return `cudf::detail::host_vector`, which can use a pinned or a pageable memory resource. When pinned memory is used, the D2H copy is potentially done using a CUDA kernel. Also added factories to create `host_vector`s without device data. These are useful to replace uses of `std::vector` and `thrust::host_vector` when the data eventually gets copied to the GPU.

Added `is_device_accessible` to `host_span`. With this, `make_device_uvector` can optionally use the kernel for the H2D copy.

Modified `cudf::detail::host_vector` to be derived from `thrust::host_vector`, to avoid issues with implicit conversion from `std::vector`. Used `cudf::detail::host_vector` and its new factory functions wherever data ends up copied to the GPU.

Stopped using `thrust::copy_n` for the kernel copy path in `cuda_memcpy` because of an optimization that allows it to fall back to `cudaMemcpyAsync`. We now call a simple local kernel.
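A simplified, generic illustration of why pinned staging buffers matter for #16206 (plain CUDA runtime calls, not the `cudf::detail` factories described above): page-locked host memory lets the copy run asynchronously via DMA, while pageable memory forces an extra synchronous staging step.

```cpp
#include <cuda_runtime.h>

#include <cstddef>

int main()
{
  constexpr std::size_t n = 1 << 20;
  float* h_pinned = nullptr;
  float* d_data   = nullptr;

  cudaMallocHost(&h_pinned, n * sizeof(float));  // pinned (page-locked) host buffer
  cudaMalloc(&d_data, n * sizeof(float));

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // From pinned memory this copy can be truly asynchronous on `stream`.
  cudaMemcpyAsync(d_data, h_pinned, n * sizeof(float), cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);

  cudaStreamDestroy(stream);
  cudaFree(d_data);
  cudaFreeHost(h_pinned);
  return 0;
}
```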
Authors:
- Vukasin Milovanovic (https://github.com/vuule)

Approvers:
- Robert Maynard (https://github.com/robertmaynard)
- Yunsong Wang (https://github.com/PointKernel)
- Nghia Truong (https://github.com/ttnghia)
- Alessandro Bellina (https://github.com/abellina)

URL: https://github.com/rapidsai/cudf/pull/16206

commit dcf791c83e3ab87d57d94017ee7413d96f9e99a5
Merge: 7a09f809dc 39f256c339
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Tue Jul 23 20:03:22 2024 -0400

Merge pull request #16372 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 39f256c3397afc9c495cb819636abddb23f81dc0
Author: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue Jul 23 19:03:16 2024 -0500

Fall back to CPU for unsupported libcudf binaryops in cudf-polars (#16188)

This PR adds logic that should trigger CPU fallback for unsupported binary ops.

Authors:
- https://github.com/brandon-b-miller
- Lawrence Mitchell (https://github.com/wence-)

Approvers:
- Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16188

commit 7a09f809dc5c8cf8d2663fae186e4d249893c888
Merge: a3aacd8915 cd711913d2
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Tue Jul 23 18:24:24 2024 -0400

Merge pull request #16370 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit cd711913d2312ba158e34f5c03784a7b07f1583a
Author: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed Jul 24 00:24:19 2024 +0200

Adds write-coalescing code path optimization to FST (#16143)

This PR adds an optimized code path to the finite-state transducer (FST) that will use a shared memory-backed write buffer for the translated output and translated output indexes, if the write buffer does not require allocating excessive amounts of shared memory (the current heuristic is 24 KB/CTA). Writes are first buffered in shared memory and then collaboratively written out using coalesced writes to global memory.

## Benchmark results

Numbers are for libcudf's FST_NVBENCH for a 1.073 GB input. FST outputs one token per input symbol. Benchmarks run on V100 with 900 GB/s theoretical peak BW. We compare the current FST implementation (old) to an FST implementation that uses write-coalescing to gmem (new).
|                  | OLD throughput (GB/s) | NEW throughput (GB/s) | relative performance |   | 1st kernel, per byte: bytes read/written | 2nd kernel, per byte: bytes read/written | expected SOL (GB/s) | achieved SOL (old) | achieved SOL (new) |
|------------------|-----------------------|-----------------------|----------------------|---|------------------------------------------|------------------------------------------|---------------------|--------------------|--------------------|
| full             | 15.7                  | 74.74                 | 476%                 |   | 1                                        | 6                                        | 102.86              | 15.26%             | 72.66%             |
| no out-indexes   | 39.123                | 105.8                 | 270%                 |   | 1                                        | 2                                        | 240.00              | 16.30%             | 44.08%             |
| no-output        | 229.27                | 178.92                | 78%                  |   | 1                                        | 1                                        | 360.00              | 63.69%             | 49.70%             |
| out-indexes-only | 24.95                 | 85.2                  | 341%                 |   | 1                                        | 5                                        | 120.00              | 20.79%             | 71.00%             |

Authors:
- Elias Stehle (https://github.com/elstehle)

Approvers:
- Shruti Shivakumar (https://github.com/shrshi)
- Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16143
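The shared-memory write-buffer pattern from #16143 in its simplest form (a generic CUDA sketch of the technique, not the FST kernel itself): each thread stages its output in shared memory, then the block flushes to global memory so that consecutive threads store to consecutive addresses.

```cpp
constexpr int BLOCK_SIZE = 256;

__global__ void buffered_transform(int const* in, int* out, int n)
{
  __shared__ int write_buffer[BLOCK_SIZE];

  int const idx = blockIdx.x * blockDim.x + threadIdx.x;

  // Stage per-thread results in shared memory; irregular or scattered
  // writes are cheap here.
  write_buffer[threadIdx.x] = (idx < n) ? in[idx] * 2 : 0;
  __syncthreads();

  // Flush collaboratively: thread i writes element i, so the stores to
  // global memory coalesce into full transactions.
  if (idx < n) { out[idx] = write_buffer[threadIdx.x]; }
}
```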
commit a3aacd8915fa503ea4be8e1d7797a080e0427923
Merge: 2de9fa7bd8 ff30c02111
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Tue Jul 23 15:04:01 2024 -0400

Merge pull request #16366 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit ff30c0211109e14b1f6918fcc6c2e2b98f863a1f
Author: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Tue Jul 23 12:03:55 2024 -0700

Fix compile warnings with `jni_utils.hpp` (#16336)

This fixes the compiler warnings with `jni_utils.hpp`, removing some `const` qualifiers that are redundant. Closes https://github.com/rapidsai/cudf/issues/16335.

Authors:
- Nghia Truong (https://github.com/ttnghia)

Approvers:
- Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/16336

commit 2de9fa7bd821c7b1653340dfca4e6a1e9e216cc5
Merge: bc609fb648 e6d412cba7
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Tue Jul 23 07:03:33 2024 -0400

Merge pull request #16364 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit e6d412cba7c23df7ee500c28257ed9281cea49b9
Author: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue Jul 23 06:03:28 2024 -0500

Fall back when casting a timestamp to numeric in cudf-polars (#16232)

This PR adds logic that falls back to CPU when a cudf-polars query would cast a timestamp column to a numeric type, an unsupported operation in libcudf, which should fix a few polars tests. It could be cleaned up a bit with some of the utilities that will be added in https://github.com/rapidsai/cudf/pull/16150.

Authors:
- https://github.com/brandon-b-miller

Approvers:
- Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16232

commit bc609fb6482e32152d64f3e9d34aaa4cb9b87cec
Merge: 023dba6fab c7b28ceeb4
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Tue Jul 23 06:28:20 2024 -0400

Merge pull request #16363 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit c7b28ceeb46d2b921e30f081a9ed97745c91ff9e
Author: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue Jul 23 05:28:13 2024 -0500

Add `drop_nulls` in `cudf-polars` (#16290)

Closes https://github.com/rapidsai/cudf/issues/16219

Authors:
- https://github.com/brandon-b-miller

Approvers:
- Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16290

commit 023dba6fab1c00116b11ff10fc7536d4f9e78fcd
Merge: 4a0813b681 0cac2a9d68
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Mon Jul 22 17:18:26 2024 -0400

Merge pull request #16359 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 0cac2a9d68341a38721be16132ead14cf4a0d70b
Author: Shruti Shivakumar
Date: Mon Jul 22 14:18:21 2024 -0700

Remove size constraints on source files in batched JSON reading (#16162)

Addresses https://github.com/rapidsai/cudf/issues/16138

The batched multi-source JSON reader fails when the size of any of the input source buffers exceeds `INT_MAX` bytes. The goal of this PR is to remove this constraint by modifying the batching behavior of the reader. Instead of constructing batches that include entire source files, the batches are now constructed at the granularity of byte ranges of size at most `INT_MAX` bytes.

Authors:
- Shruti Shivakumar (https://github.com/shrshi)

Approvers:
- Vukasin Milovanovic (https://github.com/vuule)
- Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16162
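The batching rule from #16162 as a standalone sketch: splitting a total byte count into ranges that each stay within `INT_MAX` bytes (the function is a hypothetical illustration, not the reader's code).

```cpp
#include <algorithm>
#include <climits>
#include <cstddef>
#include <utility>
#include <vector>

// Returns (offset, size) byte ranges, each at most INT_MAX bytes.
std::vector<std::pair<std::size_t, std::size_t>> make_byte_range_batches(
  std::size_t total_bytes)
{
  constexpr auto max_batch = static_cast<std::size_t>(INT_MAX);
  std::vector<std::pair<std::size_t, std::size_t>> batches;
  for (std::size_t offset = 0; offset < total_bytes; offset += max_batch) {
    batches.emplace_back(offset, std::min(max_batch, total_bytes - offset));
  }
  return batches;
}
```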
commit 4a0813b68158474b00d3e7c692310b62b48fe2fc
Merge: a4acaa7177 81e65ee312
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Mon Jul 22 16:18:45 2024 -0400

Merge pull request #16357 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 81e65ee312af5133ca2b98d52efaeb29c274a825
Author: GALI PREM SAGAR
Date: Mon Jul 22 15:18:40 2024 -0500

Fix docstring of `DataFrame.apply` (#16351)

This PR fixes the docstring of `DataFrame.apply`.

Authors:
- GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
- Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16351

commit a4acaa717798a3a09a57ab333965c00666d9d808
Merge: 0868314b1d 996cb8d870
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Mon Jul 22 16:15:22 2024 -0400

Merge pull request #16356 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 996cb8d870b7b6153802bde670435e8cd3b8775d
Author: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Mon Jul 22 16:15:16 2024 -0400

Migrate lists/sorting to pylibcudf (#16179)

Part of #15162.

Authors:
- Matthew Murray (https://github.com/Matt711)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16179

commit 0868314b1d5f2ca31eb56f4fee5f75de42b22fbe
Merge: a3ebf3badd c14c8bf59f
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Mon Jul 22 15:04:01 2024 -0400

Merge pull request #16355 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit c14c8bf59fd1e97fe94c8dfd2db6df7f9a6c65ad
Author: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Mon Jul 22 12:03:56 2024 -0700

Implement parquet reading using pylibcudf in cudf-polars (#16346)

Replace cudf-classic with pylibcudf for parquet reading in cudf-polars.

Authors:
- Thomas Li (https://github.com/lithomas1)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16346

commit a3ebf3badd0c7375b3f24dd466d4db8fa127000e
Merge: edbb1bcd9c e0a00c1fcb
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Mon Jul 22 15:03:29 2024 -0400

Merge pull request #16354 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit e0a00c1fcb4b72b7abd29debe5b2f6b38081d39a
Author: Jayjeet Chakraborty
Date: Mon Jul 22 12:03:24 2024 -0700

Add `stream` param to list explode APIs (#16317)

Add `stream` param to list `explode*` APIs. Partially fixes https://github.com/rapidsai/cudf/issues/13744

Authors:
- Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16317

commit edbb1bcd9c363876b79039caf7176270ee3eba03
Merge: b52ec0f436 e54b82c9f3
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Mon Jul 22 15:03:09 2024 -0400

Merge pull request #16353 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit e54b82c9f3499b35e7e789d41d2042a5d5a80810
Author: Mark Harris <783069+harrism@users.noreply.github.com>
Date: Tue Jul 23 05:03:04 2024 +1000

Use resource_ref for upstream in stream_checking_resource_adaptor (#16187)

As we move toward replacing all `device_memory_resource` pointers with `resource_ref`s, there are some places where changes can be made ahead of RMM to simplify required changes as RMM is refactored. In this PR I eliminate the unnecessary `Upstream` template parameter from `cudf_test::stream_checking_resource_adaptor`, and use a `device_async_resource` for the upstream resource. A similar change will be made to all RMM resource adaptors, but this one can be done without deprecations since it is just a test utility.

Authors:
- Mark Harris (https://github.com/harrism)
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16187
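The shape of the #16187 change, sketched on a made-up adaptor: the `Upstream` template parameter disappears and the adaptor holds a type-erased `rmm::device_async_resource_ref` instead of an `Upstream*`. The adaptor name and its two-method interface here are hypothetical, and the exact `resource_ref` call signatures are an assumption based on the cuda::mr interface:

```cpp
#include <rmm/resource_ref.hpp>

#include <cuda/stream_ref>

#include <cstddef>

class upstream_forwarding_adaptor {
 public:
  explicit upstream_forwarding_adaptor(rmm::device_async_resource_ref upstream)
    : upstream_{upstream}
  {
  }

  void* allocate(std::size_t bytes, cuda::stream_ref stream)
  {
    // Forward to whatever concrete resource the type-erased ref points at.
    return upstream_.allocate_async(bytes, alignof(std::max_align_t), stream);
  }

  void deallocate(void* ptr, std::size_t bytes, cuda::stream_ref stream)
  {
    upstream_.deallocate_async(ptr, bytes, alignof(std::max_align_t), stream);
  }

 private:
  rmm::device_async_resource_ref upstream_;
};
```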
Partially solves #13744

Authors:
- Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

Approvers:
- Mark Harris (https://github.com/harrism)
- Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16319

commit 0135e468808ccf7e8471e654bcd723eafb9c48c5
Merge: c53f9c54ac 135c99512e
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Mon Jul 22 10:13:37 2024 -0400

Merge pull request #16344 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 135c99512e5f7a2d38f6a870ad6883ccb39a3cce
Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon Jul 22 04:13:32 2024 -1000

Align Series APIs with pandas 2.x (#16333)

Similar to https://github.com/rapidsai/cudf/pull/16310, the following APIs have been modified to adjust/add parameters:

* `reindex`
* `reset_index`
* `add_suffix`
* `searchsorted`
* `clip`
* `mask`
* `shift`
* `dropna`
* `rename`
* `cov`
* `apply`
* `replace`

Authors:
- Matthew Roeschke (https://github.com/mroeschke)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16333

commit c53f9c54ac9e4d25350f04ffcb41ceb5bca9bdb2
Merge: c636778de3 852b151002
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Mon Jul 22 09:48:23 2024 -0400

Merge pull request #16343 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 852b151002dc76e9f09d3529c80e4b589f1df9fc
Author: Lawrence Mitchell
Date: Mon Jul 22 14:48:18 2024 +0100

Fix issue in horizontal concat implementation in cudf-polars (#16271)

Shorter tables must be extended to the same length as the longest table.

Authors:
- Lawrence Mitchell (https://github.com/wence-)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16271

commit c636778de39491e24ace55d99dcfb29c574a20d2
Merge: dacc6c0baa e6537de747
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Fri Jul 19 23:10:44 2024 -0400

Merge pull request #16342 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit e6537de7474c91b4153542e6611c8a4e33a58caa
Author: Vyas Ramasubramani
Date: Fri Jul 19 20:10:40 2024 -0700

Experimental support for configurable prefetching (#16020)

This PR adds experimental support for prefetching managed memory at a select few points in libcudf. A new configuration object is introduced for handling whether prefetching is enabled or disabled, and whether to print debug information about pointers being prefetched. Prefetching control is managed on a per-API basis to enable profiling of the effects of prefetching different classes of data in different contexts.

Prefetching in this PR always occurs on the default stream, so it will trigger synchronization with any blocking streams that the user has created. Turning on prefetching and then passing non-blocking streams to any libcudf APIs will trigger undefined behavior.
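The CUDA primitive underneath the #16020 feature (a generic sketch, not libcudf's wrapper): a managed allocation is migrated to the GPU ahead of first use with `cudaMemPrefetchAsync`.

```cpp
#include <cuda_runtime.h>

#include <cstddef>

int main()
{
  constexpr std::size_t n = 1 << 20;
  float* data = nullptr;
  cudaMallocManaged(&data, n * sizeof(float));

  int device = 0;
  cudaGetDevice(&device);

  // Migrate the pages to `device` on the default stream, so a subsequent
  // kernel does not page-fault on first access.
  cudaMemPrefetchAsync(data, n * sizeof(float), device, 0);
  cudaDeviceSynchronize();

  cudaFree(data);
  return 0;
}
```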
Authors:
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- David Wendt (https://github.com/davidwendt)
- Kyle Edwards (https://github.com/KyleFromNVIDIA)
- Thomas Li (https://github.com/lithomas1)
- Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16020

commit dacc6c0baa47c89fe8e0d1c3d246bcc94a4b6416
Merge: 1ccdf15dd7 c5b96003ce
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Fri Jul 19 23:04:24 2024 -0400

Merge pull request #16341 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit c5b96003cef00b2635923d03edcd48a13821a61e
Author: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri Jul 19 20:04:19 2024 -0700

Migrate Parquet reader to pylibcudf (#16078)

xref #15162

Migrates the parquet reader (and chunked parquet reader) to pylibcudf. (Does not migrate the writers or the metadata reader yet.)

Authors:
- Thomas Li (https://github.com/lithomas1)
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)
- Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16078

commit 1ccdf15dd736a1a08aa8f566a47ca0392ca33cac
Merge: 97e1bab151 26a3799d2f
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Fri Jul 19 22:49:07 2024 -0400

Merge pull request #16340 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 26a3799d2ff9ffb2aa72d63bb388b4bee70b3440
Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri Jul 19 16:49:01 2024 -1000

Make ColumnAccessor strictly require a mapping of columns (#16285)

`ColumnAccessor` had a default `data=None` argument and initialized an empty dict in the `__init__` if `data` was not passed. This PR now makes `data` a required argument.

Additionally, if `verify=True`, the `__init__` would call `as_column` on each `data.values()`, allowing non-`ColumnBase` inputs. This PR now avoids this call and makes the caller responsible for ensuring the inputs are `ColumnBase`s.

Also adds a few `verify=False` uses internally where we know we are passing columns from a libcudf op or reconstructing from another `ColumnAccessor`.

Authors:
- Matthew Roeschke (https://github.com/mroeschke)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16285

commit 97e1bab151184aa537edf39b7e838c07e07271a9
Merge: 5ad4c877ed 75335f6af5
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Fri Jul 19 21:21:32 2024 -0400

Merge pull request #16339 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 75335f6af51bde6be68c1fb0a6caa8030b9eda3e
Author: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri Jul 19 18:21:27 2024 -0700

Report number of rows per file read by PQ reader when no row selection and fix segfault in chunked PQ reader when skip_rows > 0 (#16195)

Closes #15389
Closes #16186

This PR adds the capability to calculate and report the number of rows read from each data source into the table returned by the Parquet reader (both chunked and normal). The returned vector of counts is only valid (non-empty) when row selection (AST filter) is not being used.

This PR also fixes a segfault in the chunked parquet reader when skip_rows > 0 and the number of passes > 1.
This segfault was being caused by a couple of arithmetic errors when computing the (start_row, num_row) for the row_group_info, pass, and column chunk descriptor structs. Both changes were added to this PR, as the changes and the gtests from the former work were needed to implement the segfault fix.

Authors:
- Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
- GALI PREM SAGAR (https://github.com/galipremsagar)
- Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16195

commit 5ad4c877ed631094f358f87c003ee9b381e9e270
Merge: ebacf394d9 535db9b26e
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Fri Jul 19 20:28:20 2024 -0400

Merge pull request #16338 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 535db9b26ed1a57e4275f4a6f11b04ebeee21248
Author: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri Jul 19 17:28:14 2024 -0700

Deprecate Arrow support in I/O (#16132)

Contributes to https://github.com/rapidsai/cudf/issues/15193

Authors:
- Thomas Li (https://github.com/lithomas1)
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- Richard (Rick) Zamora (https://github.com/rjzamora)
- Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16132

commit ebacf394d975fa5a0f65a7337d5587c9e8273902
Merge: b11cdf854d e169e8e427
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Fri Jul 19 19:36:08 2024 -0400

Merge pull request #16337 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit e169e8e4273e4d317e3f27c810c5b137dd75adb3
Author: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri Jul 19 16:36:03 2024 -0700

Implement read_csv in cudf-polars using pylibcudf (#16307)

Replace cudf-classic with pylibcudf for CSV reading in cudf-polars.

Authors:
- Thomas Li (https://github.com/lithomas1)
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16307

commit b11cdf854d64e248d682ad2d8178f8ae08e34b3e
Merge: d82caec4e0 5dde41d7f7
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Fri Jul 19 19:08:41 2024 -0400

Merge pull request #16334 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 5dde41d7f7533180ecd355bac248a7ed18adcc10
Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri Jul 19 13:08:36 2024 -1000

Replace is_float/integer_dtype checks with .kind checks (#16261)

It appears these were called when we already had a dtype object, so we can instead simply check the `.kind` attribute.

Authors:
- Matthew Roeschke (https://github.com/mroeschke)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16261

commit d82caec4e04468b497f2d553221c6314c53f9d10
Merge: 3c3ee56637 4c46628eaf
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Fri Jul 19 18:51:12 2024 -0400

Merge pull request #16332 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 4c46628eaf7ba16a2a181ceb3311f315cd4932dc
Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri Jul 19 12:51:07 2024 -1000

Mark cudf._typing as a typing module in ruff (#16318)

Additionally breaks up the prior single line of `select` rules that are enabled.
Authors:
- Matthew Roeschke (https://github.com/mroeschke)

Approvers:
- Thomas Li (https://github.com/lithomas1)
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16318

commit 3c3ee56637116e07804f20efab46d4dd3aa7c4cf
Merge: 1cb07e0c29 7d3083254c
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Fri Jul 19 18:48:43 2024 -0400

Merge pull request #16331 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 7d3083254c0503b07f82af32188120f42acef860
Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri Jul 19 12:48:39 2024 -1000

Replace np.isscalar/issubdtype checks with is_scalar/.kind checks (#16275)

* `is_scalar` also handles `cudf.Scalar`s, which should be handled internally
* `issubdtype` can largely be replaced by checking the `.kind` attribute on the dtype

Authors:
- Matthew Roeschke (https://github.com/mroeschke)

Approvers:
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16275

commit 1cb07e0c29c0b6acd1896ecef867afeca27a84c1
Merge: 52657b3375 57ed7fce67
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Fri Jul 19 18:25:01 2024 -0400

Merge pull request #16330 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit 57ed7fce6742abc96a8fd65216f032bad5937a2f
Author: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Fri Jul 19 17:24:55 2024 -0500

Add tests for `pylibcudf` binaryops (#15470)

This PR implements a more general approach to testing binaryops that originally came up in https://github.com/rapidsai/cudf/pull/15279. This PR can possibly supersede that one.

Authors:
- https://github.com/brandon-b-miller

Approvers:
- Lawrence Mitchell (https://github.com/wence-)
- Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15470

commit 52657b3375c900a66b6ec5f8d7e1ebe37c38232f
Merge: 6be515506d ecc27a1140
Author: gpuCI <38199262+GPUtester@users.noreply.github.com>
Date: Fri Jul 19 17:55:45 2024 -0400

Merge pull request #16329 from rapidsai/branch-24.08

Forward-merge branch-24.08 into branch-24.10

commit ecc27a1140c0c287091f6a1291dfaf7ccd82cb19
Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri Jul 19 11:55:40 2024 -1000

Align more DataFrame APIs with pandas (#16310)

I have a script that did some signature comparisons between the `pandas.DataFrame` and `cudf.DataFrame` APIs, and it appears some signatures have changed between the pandas 1.x and 2.x releases. The API changes in this PR are mostly adding implementations or adding missing keyword arguments (although they might not be implemented). The APIs affected are:

* `__init__`
* `__array__`
* `__arrow_c_stream__`
* `to_dict`
* `where`
* `add_prefix`
* `join`
* `apply`
* `to_records`
* `from_records`
* `unstack`
* `pct_change`
* `sort_values`

Marking as breaking as I ensured some added keywords are in the same positions as pandas and therefore might break users who are using purely positional arguments.
Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16310 commit 6be515506d4a6f833e71ac67f16c2925f7b8576b Merge: fcaea56166 6e37afc7c9 Author: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Fri Jul 19 17:52:32 2024 -0400 Merge pull request #16328 from rapidsai/branch-24.08 Forward-merge branch-24.08 into branch-24.10 commit 6e37afc7c9e177b307c41950e52453bd5906af44 Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri Jul 19 11:52:27 2024 -1000 Make __bool__ raise for more cudf objects (#16311) To match pandas, this PR makes `DataFrame`, `MultiIndex`, and `RangeIndex` raise on `__bool__`. Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16311 commit fcaea56166e2d8f8b1916d702ec8572a9e12b2be Merge: 051fadd250 910989eb8f Author: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Fri Jul 19 17:48:42 2024 -0400 Merge pull request #16327 from rapidsai/branch-24.08 Forward-merge branch-24.08 into branch-24.10 commit 910989eb8fb87b2e896aa032260705c27cce71e0 Author: Bradley Dice Date: Fri Jul 19 15:48:37 2024 -0600 Rename gather/scatter benchmarks to clarify coalesced behavior. (#16083) The benchmark names `coalesce_x` and `coalesce_o` are not very clear. This PR renames them to `coalesced` and `shuffled`. This was discussed with @GregoryKimball. Authors: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/16083 commit 051fadd2500bc20b90b74d662deec918ee27f299 Merge: ece86996ad fa0d89d9b4 Author: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Fri Jul 19 17:46:33 2024 -0400 Merge pull request #16326 from rapidsai/branch-24.08 Forward-merge branch-24.08 into branch-24.10 commit fa0d89d9b4b4152b919999b5f01b1e68407469c5 Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri Jul 19 11:46:28 2024 -1000 Clean unneeded/redundant dtype utils (#16309) * Replace `min_scalar_type` with `min_signed_type` (the former just called the latter) * Replace `numeric_normalize_types` with `find_common_dtype` followed by a column `astype` * Removed `_NUMPY_SCTYPES`, instead hardcoding the integer/floating types or using `np.integer`/`np.floating` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16309 commit ece86996ad69b1631e0da6f4dfb551cda38585a8 Merge: f47c891a2e 18f5fe0010 Author: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Fri Jul 19 17:41:47 2024 -0400 Merge pull request #16325 from rapidsai/branch-24.08 Forward-merge branch-24.08 into branch-24.10 commit 18f5fe0010fd42f604a340cd025a9ca9e122c6f5 Author: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri Jul 19 14:41:39 2024 -0700 Fix polars for 1.2.1 (#16316) I think Polars made a breaking change in a patch release.
At least the error we're getting looks like the error from https://github.com/pola-rs/polars/pull/17606. Authors: - Thomas Li (https://github.com/lithomas1) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16316 commit f47c891a2ea3a0de4bb0462d557531e046860fbb Merge: c61638cbeb 3df4ac2842 Author: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Fri Jul 19 16:46:23 2024 -0400 Merge pull request #16323 from rapidsai/branch-24.08 Forward-merge branch-24.08 into branch-24.10 commit 3df4ac28423b99e4dd88570da8d55e2e5af2e1bc Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri Jul 19 10:46:18 2024 -1000 Remove squeeze argument from groupby (#16312) In pandas, this argument was deprecated in pandas 1.x and removed in pandas 2.x. xref https://github.com/pandas-dev/pandas/pull/33218 It looks like this argument was never implemented in cudf, so to align with pandas, I think it should be OK to just remove it. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16312 commit c61638cbeb4eeb9ce9244508edbe33ccc301b07e Merge: f7e9d0c0f8 cb570fe6d7 Author: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Fri Jul 19 16:45:35 2024 -0400 Merge pull request #16322 from rapidsai/branch-24.08 Forward-merge branch-24.08 into branch-24.10 commit cb570fe6d7dc7ebdd6c8c030916ba27bef277b5e Author: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri Jul 19 10:45:30 2024 -1000 Deprecate dtype= parameter in reduction methods (#16313) In terms of pandas alignment, this argument doesn't exist in reduction ops. Additionally, the same result can easily be achieved by calling `astype` after the operation, and it appears libcudf does not support arbitrary casting to an output type. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16313 commit f7e9d0c0f829118f06054c2e03425d7ddf33767e Merge: f364fdcd44 dc62177a64 Author: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Fri Jul 19 15:17:46 2024 -0400 Merge pull request #16320 from rapidsai/branch-24.08 Forward-merge branch-24.08 into branch-24.10 commit dc62177a64a5fb4d6521f346ff0f44c2ede740f6 Author: Lawrence Mitchell Date: Fri Jul 19 20:17:42 2024 +0100 Preserve order in left join for cudf-polars (#16268) Unlike all other joins, polars provides an ordering guarantee for left joins. By default libcudf does not, so we need to order the gather maps in this case. While here, because this would otherwise require another hard-coding of `int32` for something that should be `size_type`, expose `type_to_id` in Cython and plumb it through. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16268 commit f364fdcd44540b6d5403f1d08acbebfff4e78bd4 Author: Ray Douglass Date: Fri Jul 19 14:56:13 2024 -0400 DOC v24.10 Updates [skip ci] commit d5ab48d4f2586d2e45234463c1bbe877ce76afe8 Author: Kyle Edwards Date: Fri Jul 19 14:32:54 2024 -0400 Use workflow branch 24.08 again (#16314) After updating everything to CUDA 12.5.1, use `shared-workflows@branch-24.08` again.
Contributes to https://github.com/rapidsai/build-planning/issues/73 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/16314 commit 2bbeee95ec338c30c0c876dc6a58376fbb0a5a06 Author: Ray Bell Date: Fri Jul 19 12:43:49 2024 -0400 DOC: use intersphinx mapping in pandas-compat ext (#15846) ~~If https://github.com/rapidsai/cudf/pull/15704 is merged~~ This PR changes the header in the admonition (pandas compat box) to be hyperlinked to the pandas docs instead of just text. See https://raybellwaves.github.io/compatsphinxext/compat.html, the docs of a minimal repo where I have been testing this. Authors: - Ray Bell (https://github.com/raybellwaves) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15846 commit 461ed33753545832da0ff13bf01cf922a651bf0a Merge: 9a713e3adb 752b1f32b1 Author: Jake Awe <50372925+AyodeAwe@users.noreply.github.com> Date: Fri Jul 19 11:06:06 2024 -0500 Merge pull request #16315 from vyasr/branch-24.08-merge-branch-24.06 Branch 24.08 merge branch 24.06 commit 752b1f32b128b69847c8fc306f1c28ab7f91354b Merge: 9a713e3adb 781794bb52 Author: Vyas Ramasubramani Date: Fri Jul 19 15:04:17 2024 +0000 Merge branch 'branch-24.06' into branch-24.08-merge-branch-24.06 commit 9a713e3adb8abb1f41de0445b8ea896fdb48c560 Author: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Fri Jul 19 10:34:16 2024 -0400 Migrate lists/count_elements to pylibcudf (#16072) Part of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16072 commit 8ff27ed5bcaf8fc5fc8d1f546dee30c59861c320 Author: Lawrence Mitchell Date: Fri Jul 19 15:15:20 2024 +0100 Support Literals in groupby-agg (#16218) To do this, we just need to collect the appropriate aggregation information and broadcast literals to the correct size. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16218 commit debbef0bc12f523054740432983030dd0b24f9c4 Author: Lawrence Mitchell Date: Fri Jul 19 15:12:56 2024 +0100 Update vendored thread_pool implementation (#16210) Since we introduced the vendored thread_pool in #8752, upstream has introduced some new features, and in particular now uses condition variables/notifications to handle the case where there are no tasks in the queue (see the conceptual sketch below). This avoids the issue described in #16209, where the thread pool by default artificially introduces a delay of 1000 microseconds to all tasks whenever the task queue is emptied. - Closes #16209 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/16210 commit 781794bb52448f617351ed96441a8e2fdb765dd7 Author: Vyas Ramasubramani Date: Mon Jul 1 14:59:04 2024 -0700 Backport #16045 to 24.06 (#16102) Backporting #16045 for a patch release.
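A conceptual sketch of the thread-pool change described in #16210 above, written in Python as a stand-in for the vendored C++ implementation (none of these names come from the actual library): workers block on a condition variable and are notified when work arrives, rather than polling the queue with a fixed sleep.

```python
import threading
from collections import deque

class TinyPool:
    """Single-worker pool illustrating wait/notify instead of polling."""

    def __init__(self) -> None:
        self.tasks = deque()
        self.cv = threading.Condition()
        threading.Thread(target=self._run, daemon=True).start()

    def submit(self, fn) -> None:
        with self.cv:
            self.tasks.append(fn)
            self.cv.notify()  # wake a waiting worker immediately

    def _run(self) -> None:
        while True:
            with self.cv:
                while not self.tasks:
                    # Blocks until notified; no fixed per-task delay
                    # like a sleep-based polling loop would add.
                    self.cv.wait()
                fn = self.tasks.popleft()
            fn()  # run the task outside the lock
```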
--------- Co-authored-by: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> commit dfab1b589e5907b324dc1688f6dab862d194012c Author: Bradley Dice Date: Mon Jul 1 15:33:42 2024 -0500 Backport: Use size_t to allow large conditional joins (#16127) (#16133) Backports #16127 to 24.06 for inclusion in a hotfix release. --------- Co-authored-by: Vyas Ramasubramani commit e41242094092f9ed31fd4d04f8a30107c1ffb2ff Author: Vyas Ramasubramani Date: Mon Jul 1 11:24:52 2024 -0700 Backport #16038 to 24.06 (#16101) Backporting #16038 for a patch release. --------- Co-authored-by: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> commit 4e34a20a31fae2546f9cfbaa520d7561b80563c7 Author: Bradley Dice Date: Mon Jul 1 11:18:25 2024 -0500 Backport: Fix segfault in conditional join (#16094) (#16100) Backports #16094 to 24.06 for inclusion in a hotfix release. --- .../cuda11.8-conda/devcontainer.json | 6 +- .devcontainer/cuda11.8-pip/devcontainer.json | 6 +- .../cuda12.5-conda/devcontainer.json | 9 +- .devcontainer/cuda12.5-pip/devcontainer.json | 6 +- .github/workflows/build.yaml | 20 +- .github/workflows/pandas-tests.yaml | 2 +- .github/workflows/pr.yaml | 46 +- .../workflows/pr_issue_status_automation.yml | 6 +- .github/workflows/test.yaml | 22 +- README.md | 2 +- VERSION | 2 +- ci/build_python.sh | 7 + ci/cudf_pandas_scripts/pandas-tests/run.sh | 2 +- ci/release/update-version.sh | 5 +- ci/test_wheel_cudf_polars.sh | 2 +- .../all_cuda-118_arch-x86_64.yaml | 10 +- .../all_cuda-125_arch-x86_64.yaml | 10 +- cpp/CMakeLists.txt | 13 +- cpp/benchmarks/common/generate_input.cu | 2 +- cpp/benchmarks/copying/gather.cu | 6 +- cpp/benchmarks/copying/scatter.cu | 6 +- cpp/benchmarks/fixture/benchmark_fixture.hpp | 2 +- .../groupby/group_max_multithreaded.cpp | 10 +- cpp/benchmarks/io/fst.cu | 16 +- .../io/orc/orc_reader_multithreaded.cpp | 26 +- .../io/parquet/parquet_reader_multithread.cpp | 26 +- cpp/benchmarks/lists/copying/scatter_lists.cu | 6 +- cpp/cmake/thirdparty/get_nanoarrow.cmake | 4 +- cpp/cmake/thirdparty/get_thread_pool.cmake | 25 + .../thirdparty/patches/cccl_override.json | 5 + .../patches/cccl_symbol_visibility.diff | 27 + .../developer_guide/DEVELOPER_GUIDE.md | 27 +- cpp/doxygen/developer_guide/DOCUMENTATION.md | 6 +- cpp/examples/tpch/CMakeLists.txt | 4 + cpp/examples/tpch/q1.cpp | 2 +- cpp/examples/tpch/q10.cpp | 166 +++ cpp/examples/tpch/q5.cpp | 20 +- cpp/examples/tpch/q6.cpp | 2 +- cpp/examples/versions.cmake | 2 +- cpp/include/cudf/aggregation.hpp | 5 +- .../cudf/ast/detail/expression_parser.hpp | 11 +- .../ast/detail/expression_transformer.hpp | 10 +- cpp/include/cudf/ast/detail/operators.hpp | 4 +- cpp/include/cudf/ast/expressions.hpp | 4 +- cpp/include/cudf/binaryop.hpp | 31 +- cpp/include/cudf/column/column.hpp | 4 +- .../cudf/column/column_device_view.cuh | 4 +- cpp/include/cudf/column/column_factories.hpp | 4 +- cpp/include/cudf/column/column_view.hpp | 59 +- cpp/include/cudf/concatenate.hpp | 5 +- cpp/include/cudf/contiguous_split.hpp | 13 +- cpp/include/cudf/copying.hpp | 9 +- cpp/include/cudf/datetime.hpp | 5 +- .../cudf/detail/aggregation/aggregation.hpp | 4 +- .../cudf/detail/aggregation/result_cache.hpp | 6 +- cpp/include/cudf/detail/binaryop.hpp | 5 +- cpp/include/cudf/detail/concatenate.hpp | 5 +- cpp/include/cudf/detail/concatenate_masks.hpp | 5 +- cpp/include/cudf/detail/contiguous_split.hpp | 4 +- cpp/include/cudf/detail/copy.hpp | 4 +- cpp/include/cudf/detail/datetime.hpp | 4 +- cpp/include/cudf/detail/fill.hpp | 4 +- 
cpp/include/cudf/detail/gather.cuh | 6 +- cpp/include/cudf/detail/gather.hpp | 5 +- cpp/include/cudf/detail/groupby.hpp | 13 +- .../detail/groupby/group_replace_nulls.hpp | 4 +- .../cudf/detail/groupby/sort_helper.hpp | 12 +- cpp/include/cudf/detail/interop.hpp | 5 +- cpp/include/cudf/detail/is_element_valid.hpp | 7 +- cpp/include/cudf/detail/join.hpp | 9 +- cpp/include/cudf/detail/label_bins.hpp | 4 +- cpp/include/cudf/detail/merge.hpp | 6 +- cpp/include/cudf/detail/null_mask.cuh | 4 +- cpp/include/cudf/detail/null_mask.hpp | 12 +- cpp/include/cudf/detail/quantiles.hpp | 5 +- cpp/include/cudf/detail/repeat.hpp | 4 +- cpp/include/cudf/detail/replace.hpp | 4 +- cpp/include/cudf/detail/reshape.hpp | 8 +- cpp/include/cudf/detail/rolling.hpp | 4 +- cpp/include/cudf/detail/round.hpp | 4 +- cpp/include/cudf/detail/scan.hpp | 7 +- cpp/include/cudf/detail/scatter.hpp | 5 +- cpp/include/cudf/detail/search.hpp | 8 +- cpp/include/cudf/detail/sequence.hpp | 4 +- cpp/include/cudf/detail/sorting.hpp | 4 +- cpp/include/cudf/detail/stream_compaction.hpp | 4 +- cpp/include/cudf/detail/structs/utilities.hpp | 8 +- cpp/include/cudf/detail/tdigest/tdigest.hpp | 12 +- cpp/include/cudf/detail/timezone.hpp | 7 +- cpp/include/cudf/detail/transform.hpp | 5 +- cpp/include/cudf/detail/transpose.hpp | 5 +- cpp/include/cudf/detail/unary.hpp | 5 +- .../cudf/detail/utilities/alignment.hpp | 6 +- .../cudf/detail/utilities/cuda_memcpy.hpp | 8 +- .../cudf/detail/utilities/default_stream.hpp | 8 +- .../cudf/detail/utilities/host_memory.hpp | 51 + .../cudf/detail/utilities/host_vector.hpp | 31 +- .../cudf/detail/utilities/linked_column.hpp | 9 +- .../cudf/detail/utilities/stacktrace.hpp | 10 +- .../cudf/detail/utilities/stream_pool.hpp | 7 +- .../detail/utilities/vector_factories.hpp | 111 +- cpp/include/cudf/detail/valid_if.cuh | 2 +- .../cudf/dictionary/detail/concatenate.hpp | 10 +- cpp/include/cudf/dictionary/detail/encode.hpp | 10 +- cpp/include/cudf/dictionary/detail/merge.hpp | 10 +- .../cudf/dictionary/detail/replace.hpp | 10 +- cpp/include/cudf/dictionary/detail/search.hpp | 5 +- .../cudf/dictionary/detail/update_keys.hpp | 10 +- .../dictionary/dictionary_column_view.hpp | 6 +- .../cudf/dictionary/dictionary_factories.hpp | 17 +- cpp/include/cudf/dictionary/encode.hpp | 4 +- cpp/include/cudf/dictionary/search.hpp | 4 +- cpp/include/cudf/dictionary/update_keys.hpp | 4 +- cpp/include/cudf/filling.hpp | 5 +- cpp/include/cudf/fixed_point/fixed_point.hpp | 4 +- .../cudf/fixed_point/floating_conversion.hpp | 5 +- cpp/include/cudf/fixed_point/temporary.hpp | 4 +- cpp/include/cudf/groupby.hpp | 5 +- cpp/include/cudf/hashing.hpp | 5 +- cpp/include/cudf/hashing/detail/hashing.hpp | 10 +- cpp/include/cudf/interop.hpp | 88 +- cpp/include/cudf/interop/detail/arrow.hpp | 53 - cpp/include/cudf/io/arrow_io_source.hpp | 8 +- cpp/include/cudf/io/avro.hpp | 4 +- cpp/include/cudf/io/csv.hpp | 4 +- cpp/include/cudf/io/data_sink.hpp | 6 +- cpp/include/cudf/io/datasource.hpp | 7 +- cpp/include/cudf/io/detail/avro.hpp | 13 +- cpp/include/cudf/io/detail/csv.hpp | 13 +- cpp/include/cudf/io/detail/json.hpp | 7 +- cpp/include/cudf/io/detail/orc.hpp | 13 +- cpp/include/cudf/io/detail/parquet.hpp | 13 +- cpp/include/cudf/io/detail/tokenize_json.hpp | 5 +- cpp/include/cudf/io/detail/utils.hpp | 15 +- cpp/include/cudf/io/json.hpp | 8 +- cpp/include/cudf/io/orc.hpp | 14 +- cpp/include/cudf/io/orc_metadata.hpp | 5 +- cpp/include/cudf/io/orc_types.hpp | 10 +- cpp/include/cudf/io/parquet.hpp | 18 +- cpp/include/cudf/io/parquet_metadata.hpp | 5 +- 
cpp/include/cudf/io/text/byte_range_info.hpp | 5 +- .../cudf/io/text/data_chunk_source.hpp | 5 +- .../io/text/data_chunk_source_factories.hpp | 9 +- .../cudf/io/text/detail/bgzip_utils.hpp | 7 +- .../cudf/io/text/detail/multistate.hpp | 8 +- .../cudf/io/text/detail/tile_state.hpp | 6 +- cpp/include/cudf/io/text/detail/trie.hpp | 9 +- cpp/include/cudf/io/text/multibyte_split.hpp | 4 +- cpp/include/cudf/io/types.hpp | 11 +- cpp/include/cudf/join.hpp | 16 +- cpp/include/cudf/json/json.hpp | 5 +- cpp/include/cudf/labeling/label_bins.hpp | 4 +- cpp/include/cudf/lists/combine.hpp | 5 +- cpp/include/cudf/lists/contains.hpp | 5 +- cpp/include/cudf/lists/count_elements.hpp | 5 +- cpp/include/cudf/lists/detail/combine.hpp | 10 +- cpp/include/cudf/lists/detail/concatenate.hpp | 10 +- cpp/include/cudf/lists/detail/contains.hpp | 10 +- cpp/include/cudf/lists/detail/copying.hpp | 10 +- cpp/include/cudf/lists/detail/dremel.hpp | 17 +- cpp/include/cudf/lists/detail/extract.hpp | 10 +- cpp/include/cudf/lists/detail/gather.cuh | 3 + .../cudf/lists/detail/interleave_columns.hpp | 10 +- .../lists/detail/lists_column_factories.hpp | 10 +- cpp/include/cudf/lists/detail/reverse.hpp | 7 +- cpp/include/cudf/lists/detail/scatter.cuh | 10 +- .../cudf/lists/detail/set_operations.hpp | 6 +- cpp/include/cudf/lists/detail/sorting.hpp | 10 +- .../cudf/lists/detail/stream_compaction.hpp | 7 +- cpp/include/cudf/lists/explode.hpp | 12 +- cpp/include/cudf/lists/extract.hpp | 5 +- cpp/include/cudf/lists/filling.hpp | 6 +- cpp/include/cudf/lists/gather.hpp | 5 +- cpp/include/cudf/lists/list_device_view.cuh | 4 +- cpp/include/cudf/lists/list_view.hpp | 8 +- .../cudf/lists/lists_column_device_view.cuh | 8 +- cpp/include/cudf/lists/lists_column_view.hpp | 5 +- cpp/include/cudf/lists/reverse.hpp | 7 +- cpp/include/cudf/lists/set_operations.hpp | 8 +- cpp/include/cudf/lists/sorting.hpp | 5 +- cpp/include/cudf/lists/stream_compaction.hpp | 7 +- cpp/include/cudf/merge.hpp | 5 +- cpp/include/cudf/null_mask.hpp | 5 +- cpp/include/cudf/partitioning.hpp | 5 +- cpp/include/cudf/quantiles.hpp | 5 +- cpp/include/cudf/reduction.hpp | 5 +- .../cudf/reduction/detail/histogram.hpp | 7 +- .../cudf/reduction/detail/reduction.hpp | 7 +- .../reduction/detail/reduction_functions.hpp | 11 +- .../detail/segmented_reduction_functions.hpp | 11 +- cpp/include/cudf/replace.hpp | 5 +- cpp/include/cudf/reshape.hpp | 22 +- cpp/include/cudf/rolling.hpp | 5 +- .../cudf/rolling/range_window_bounds.hpp | 5 +- cpp/include/cudf/round.hpp | 5 +- cpp/include/cudf/scalar/scalar.hpp | 4 +- .../cudf/scalar/scalar_device_view.cuh | 6 +- cpp/include/cudf/scalar/scalar_factories.hpp | 4 +- cpp/include/cudf/search.hpp | 5 +- cpp/include/cudf/sorting.hpp | 5 +- cpp/include/cudf/stream_compaction.hpp | 5 +- cpp/include/cudf/strings/attributes.hpp | 4 +- cpp/include/cudf/strings/capitalize.hpp | 4 +- cpp/include/cudf/strings/case.hpp | 4 +- .../cudf/strings/char_types/char_cases.hpp | 8 +- .../cudf/strings/char_types/char_types.hpp | 4 +- .../strings/char_types/char_types_enum.hpp | 6 +- cpp/include/cudf/strings/combine.hpp | 4 +- cpp/include/cudf/strings/contains.hpp | 4 +- .../cudf/strings/convert/convert_booleans.hpp | 4 +- .../cudf/strings/convert/convert_datetime.hpp | 4 +- .../strings/convert/convert_durations.hpp | 4 +- .../strings/convert/convert_fixed_point.hpp | 4 +- .../cudf/strings/convert/convert_floats.hpp | 4 +- .../cudf/strings/convert/convert_integers.hpp | 4 +- .../cudf/strings/convert/convert_ipv4.hpp | 4 +- .../cudf/strings/convert/convert_lists.hpp | 4 +- 
.../cudf/strings/convert/convert_urls.hpp | 4 +- .../cudf/strings/detail/char_tables.hpp | 14 +- cpp/include/cudf/strings/detail/combine.hpp | 11 +- .../cudf/strings/detail/concatenate.hpp | 11 +- .../cudf/strings/detail/converters.hpp | 11 +- .../cudf/strings/detail/copy_range.hpp | 10 +- cpp/include/cudf/strings/detail/copying.hpp | 11 +- cpp/include/cudf/strings/detail/fill.hpp | 11 +- cpp/include/cudf/strings/detail/gather.cuh | 7 +- cpp/include/cudf/strings/detail/merge.hpp | 7 +- cpp/include/cudf/strings/detail/replace.hpp | 11 +- cpp/include/cudf/strings/detail/scan.hpp | 10 +- .../cudf/strings/detail/strings_children.cuh | 2 + cpp/include/cudf/strings/detail/utf8.hpp | 10 +- cpp/include/cudf/strings/detail/utilities.hpp | 11 +- cpp/include/cudf/strings/extract.hpp | 4 +- cpp/include/cudf/strings/find.hpp | 4 +- cpp/include/cudf/strings/find_multiple.hpp | 4 +- cpp/include/cudf/strings/findall.hpp | 4 +- cpp/include/cudf/strings/padding.hpp | 4 +- cpp/include/cudf/strings/regex/flags.hpp | 8 +- .../cudf/strings/regex/regex_program.hpp | 4 +- cpp/include/cudf/strings/repeat_strings.hpp | 4 +- cpp/include/cudf/strings/replace.hpp | 4 +- cpp/include/cudf/strings/replace_re.hpp | 4 +- cpp/include/cudf/strings/reverse.hpp | 4 +- cpp/include/cudf/strings/side_type.hpp | 8 +- cpp/include/cudf/strings/slice.hpp | 4 +- cpp/include/cudf/strings/split/partition.hpp | 4 +- cpp/include/cudf/strings/split/split.hpp | 4 +- cpp/include/cudf/strings/split/split_re.hpp | 4 +- cpp/include/cudf/strings/string_view.cuh | 5 +- cpp/include/cudf/strings/string_view.hpp | 6 +- .../cudf/strings/strings_column_view.hpp | 5 +- cpp/include/cudf/strings/strip.hpp | 4 +- cpp/include/cudf/strings/translate.hpp | 4 +- cpp/include/cudf/strings/wrap.hpp | 4 +- .../cudf/structs/detail/concatenate.hpp | 11 +- cpp/include/cudf/structs/detail/scan.hpp | 11 +- cpp/include/cudf/structs/struct_view.hpp | 6 +- .../structs/structs_column_device_view.cuh | 6 +- .../cudf/structs/structs_column_view.hpp | 6 +- .../cudf/table/experimental/row_operators.cuh | 4 +- cpp/include/cudf/table/row_operators.cuh | 4 +- cpp/include/cudf/table/table.hpp | 4 +- cpp/include/cudf/table/table_device_view.cuh | 6 +- .../cudf/tdigest/tdigest_column_view.hpp | 6 +- cpp/include/cudf/timezone.hpp | 6 +- cpp/include/cudf/transform.hpp | 5 +- cpp/include/cudf/transpose.hpp | 5 +- cpp/include/cudf/types.hpp | 6 +- cpp/include/cudf/unary.hpp | 5 +- cpp/include/cudf/utilities/bit.hpp | 4 +- cpp/include/cudf/utilities/default_stream.hpp | 7 +- cpp/include/cudf/utilities/error.hpp | 9 +- cpp/include/cudf/utilities/pinned_memory.hpp | 22 +- cpp/include/cudf/utilities/prefetch.hpp | 163 +++ cpp/include/cudf/utilities/span.hpp | 37 +- cpp/include/cudf/utilities/thread_pool.hpp | 381 ------ cpp/include/cudf/utilities/traits.cuh | 6 +- cpp/include/cudf/utilities/traits.hpp | 4 +- cpp/include/cudf/utilities/type_checks.hpp | 4 +- .../cudf/utilities/type_dispatcher.hpp | 6 +- cpp/include/cudf/wrappers/dictionary.hpp | 4 +- cpp/include/cudf/wrappers/durations.hpp | 6 +- cpp/include/cudf/wrappers/timestamps.hpp | 5 +- cpp/include/cudf_test/base_fixture.hpp | 5 +- cpp/include/cudf_test/column_utilities.hpp | 12 +- cpp/include/cudf_test/column_wrapper.hpp | 8 +- cpp/include/cudf_test/debug_utilities.hpp | 9 +- cpp/include/cudf_test/default_stream.hpp | 8 +- cpp/include/cudf_test/file_utilities.hpp | 3 +- .../cudf_test/io_metadata_utilities.hpp | 9 +- cpp/include/cudf_test/iterator_utilities.hpp | 7 +- cpp/include/cudf_test/print_utilities.cuh | 7 +- 
cpp/include/cudf_test/random.hpp | 5 +- .../stream_checking_resource_adaptor.hpp | 35 +- cpp/include/cudf_test/table_utilities.hpp | 9 +- cpp/include/cudf_test/tdigest_utilities.cuh | 7 +- cpp/include/cudf_test/testing_main.hpp | 9 +- cpp/include/cudf_test/timestamp_utilities.cuh | 5 +- cpp/include/cudf_test/type_list_utilities.hpp | 8 +- cpp/include/cudf_test/type_lists.hpp | 5 +- cpp/include/nvtext/byte_pair_encoding.hpp | 5 +- cpp/include/nvtext/detail/generate_ngrams.hpp | 4 +- cpp/include/nvtext/detail/load_hash_file.hpp | 4 +- cpp/include/nvtext/detail/tokenize.hpp | 4 +- cpp/include/nvtext/edit_distance.hpp | 5 +- cpp/include/nvtext/generate_ngrams.hpp | 5 +- cpp/include/nvtext/jaccard.hpp | 5 +- cpp/include/nvtext/minhash.hpp | 5 +- cpp/include/nvtext/ngrams_tokenize.hpp | 5 +- cpp/include/nvtext/normalize.hpp | 5 +- cpp/include/nvtext/replace.hpp | 5 +- cpp/include/nvtext/stemmer.hpp | 5 +- cpp/include/nvtext/subword_tokenize.hpp | 5 +- cpp/include/nvtext/tokenize.hpp | 5 +- cpp/src/aggregation/aggregation.cpp | 350 +++--- cpp/src/binaryop/binaryop.cpp | 7 +- cpp/src/binaryop/compiled/binary_ops.cu | 1 + cpp/src/bitmask/is_element_valid.cpp | 5 +- cpp/src/column/column_view.cpp | 42 + cpp/src/copying/concatenate.cu | 7 +- cpp/src/copying/contiguous_split.cu | 3 +- cpp/src/copying/purge_nonempty_nulls.cu | 1 + cpp/src/datetime/timezone.cpp | 6 +- cpp/src/dictionary/detail/concatenate.cu | 2 +- cpp/src/dictionary/dictionary_factories.cu | 13 +- cpp/src/dictionary/set_keys.cu | 1 + cpp/src/filling/calendrical_month_sequence.cu | 1 + cpp/src/interop/arrow_utilities.cpp | 31 + cpp/src/interop/arrow_utilities.hpp | 43 +- .../interop/decimal_conversion_utilities.cu | 70 ++ .../interop/decimal_conversion_utilities.cuh | 44 + cpp/src/interop/from_arrow_device.cu | 10 +- cpp/src/interop/from_arrow_host.cu | 2 +- cpp/src/interop/to_arrow.cu | 39 +- cpp/src/interop/to_arrow_device.cu | 104 +- cpp/src/interop/to_arrow_host.cu | 396 ++++++ cpp/src/interop/to_arrow_schema.cpp | 7 +- cpp/src/io/avro/reader_impl.cu | 8 +- cpp/src/io/comp/gpuinflate.hpp | 7 +- cpp/src/io/csv/reader_impl.cu | 44 +- cpp/src/io/fst/agent_dfa.cuh | 371 +++++- cpp/src/io/fst/dispatch_dfa.cuh | 7 +- cpp/src/io/fst/lookup_tables.cuh | 70 +- cpp/src/io/functions.cpp | 13 + cpp/src/io/json/json_column.cu | 4 +- cpp/src/io/json/json_normalization.cu | 26 +- cpp/src/io/json/nested_json.hpp | 18 +- cpp/src/io/json/nested_json_gpu.cu | 31 +- cpp/src/io/json/read_json.cu | 142 +-- cpp/src/io/json/read_json.hpp | 25 +- cpp/src/io/orc/reader_impl_decode.cu | 10 +- cpp/src/io/orc/stripe_enc.cu | 4 +- cpp/src/io/orc/writer_impl.cu | 50 +- cpp/src/io/orc/writer_impl.hpp | 9 +- .../io/parquet/compact_protocol_reader.hpp | 8 +- cpp/src/io/parquet/predicate_pushdown.cpp | 20 +- cpp/src/io/parquet/reader.cpp | 5 + cpp/src/io/parquet/reader_impl.cpp | 86 +- cpp/src/io/parquet/reader_impl.hpp | 31 +- cpp/src/io/parquet/reader_impl_chunking.cu | 131 +- cpp/src/io/parquet/reader_impl_chunking.hpp | 6 + cpp/src/io/parquet/reader_impl_helpers.cpp | 32 +- cpp/src/io/parquet/reader_impl_helpers.hpp | 20 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 29 +- cpp/src/io/parquet/writer_impl.cu | 67 +- cpp/src/io/utilities/base64_utilities.hpp | 8 +- cpp/src/io/utilities/data_casting.cu | 4 +- cpp/src/io/utilities/file_io_utilities.cpp | 6 +- cpp/src/io/utilities/file_io_utilities.hpp | 12 +- cpp/src/io/utilities/row_selection.hpp | 6 +- cpp/src/io/utilities/string_parsing.hpp | 5 +- cpp/src/io/utilities/trie.cuh | 6 +- cpp/src/jit/parser.hpp | 6 
+- cpp/src/join/hash_join.cu | 2 + cpp/src/lists/contains.cu | 1 + cpp/src/lists/copying/concatenate.cu | 1 + cpp/src/lists/copying/segmented_gather.cu | 1 + cpp/src/lists/dremel.cu | 6 +- cpp/src/lists/explode.cu | 29 +- cpp/src/lists/set_operations.cu | 1 + cpp/src/lists/stream_compaction/distinct.cu | 1 + cpp/src/merge/merge.cu | 1 + cpp/src/partitioning/round_robin.cu | 5 +- cpp/src/quantiles/quantile.cu | 1 + cpp/src/quantiles/quantiles.cu | 1 + cpp/src/quantiles/tdigest/tdigest.cu | 1 + cpp/src/reductions/minmax.cu | 3 +- cpp/src/reductions/scan/rank_scan.cu | 1 + cpp/src/reductions/scan/scan_inclusive.cu | 1 + cpp/src/reductions/segmented/reductions.cpp | 2 +- cpp/src/reshape/byte_cast.cu | 11 +- cpp/src/reshape/interleave_columns.cu | 4 +- cpp/src/reshape/tile.cu | 4 +- cpp/src/rolling/rolling.cu | 1 + cpp/src/scalar/scalar.cpp | 4 +- cpp/src/search/contains_column.cu | 1 + cpp/src/search/contains_scalar.cu | 2 + cpp/src/search/contains_table.cu | 1 + cpp/src/search/search_ordered.cu | 1 + cpp/src/strings/combine/join.cu | 6 +- cpp/src/strings/convert/convert_datetime.cu | 2 +- cpp/src/strings/convert/convert_durations.cu | 1 + cpp/src/strings/copying/concatenate.cu | 2 +- cpp/src/strings/filter_chars.cu | 2 +- cpp/src/strings/replace/multi_re.cu | 2 +- cpp/src/strings/strings_scalar_factories.cpp | 1 + cpp/src/strings/translate.cu | 2 +- cpp/src/strings/utilities.cu | 1 + cpp/src/table/row_operators.cu | 5 +- cpp/src/transform/one_hot_encode.cu | 1 + cpp/src/transform/row_bit_count.cu | 1 + cpp/src/utilities/cuda_memcpy.cu | 20 +- .../{pinned_memory.cpp => host_memory.cpp} | 86 +- cpp/src/utilities/prefetch.cpp | 95 ++ cpp/tests/CMakeLists.txt | 21 +- .../binaryop/binop-verify-input-test.cpp | 4 +- cpp/tests/interop/from_arrow_test.cpp | 9 + cpp/tests/interop/nanoarrow_utils.hpp | 9 +- cpp/tests/interop/to_arrow_device_test.cpp | 78 +- cpp/tests/interop/to_arrow_host_test.cpp | 1117 +++++++++++++++++ cpp/tests/interop/to_arrow_test.cpp | 10 + cpp/tests/io/fst/common.hpp | 4 +- cpp/tests/io/fst/fst_test.cu | 4 +- .../json_chunked_reader.cu} | 81 +- .../json_quote_normalization_test.cpp | 0 cpp/tests/io/{ => json}/json_test.cpp | 0 cpp/tests/io/{ => json}/json_tree.cpp | 6 +- .../io/{ => json}/json_type_cast_test.cu | 0 cpp/tests/io/json/json_utils.cuh | 105 ++ .../json_whitespace_normalization_test.cu | 0 cpp/tests/io/{ => json}/json_writer.cpp | 0 cpp/tests/io/{ => json}/nested_json_test.cpp | 0 cpp/tests/io/parquet_chunked_reader_test.cu | 394 ++++++ cpp/tests/io/parquet_reader_test.cpp | 203 +++ .../{json_tests.cpp => json_tests.cu} | 45 +- cpp/tests/streams/dictionary_test.cpp | 46 + cpp/tests/streams/interop_test.cpp | 9 + cpp/tests/streams/lists_test.cpp | 57 +- cpp/tests/streams/reshape_test.cpp | 47 + cpp/tests/strings/integers_tests.cpp | 4 +- cpp/tests/utilities/random_seed.cpp | 4 +- .../utilities_tests/pinned_memory_tests.cpp | 67 +- dependencies.yaml | 131 +- docs/cudf/source/conf.py | 6 + docs/cudf/source/cudf_pandas/how-it-works.md | 16 + .../source/developer_guide/documentation.md | 2 +- .../source/user_guide/api_docs/groupby.rst | 3 +- .../api_docs/pylibcudf/io/index.rst | 1 + .../api_docs/pylibcudf/io/parquet.rst | 6 + docs/cudf/source/user_guide/io/read-json.md | 6 +- java/ci/README.md | 4 +- java/pom.xml | 2 +- java/src/main/java/ai/rapids/cudf/Cudf.java | 36 + java/src/main/native/CMakeLists.txt | 1 + java/src/main/native/include/jni_utils.hpp | 20 +- java/src/main/native/src/CudfJni.cpp | 25 + java/src/main/native/src/RmmJni.cpp | 7 - 
java/src/main/native/src/TableJni.cpp | 5 +- .../main/native/src/aggregation128_utils.cu | 2 +- pyproject.toml | 64 +- python/cudf/CMakeLists.txt | 101 ++ python/cudf/cudf/_lib/lists.pyx | 92 +- python/cudf/cudf/_lib/parquet.pyx | 312 ++--- python/cudf/cudf/_lib/reduce.pyx | 15 +- python/cudf/cudf/_lib/types.pyx | 4 +- python/cudf/cudf/api/types.py | 2 +- python/cudf/cudf/core/_base_index.py | 54 +- python/cudf/cudf/core/_internals/where.py | 2 +- python/cudf/cudf/core/column/categorical.py | 130 +- python/cudf/cudf/core/column/column.py | 56 +- python/cudf/cudf/core/column/datetime.py | 11 +- python/cudf/cudf/core/column/decimal.py | 4 +- python/cudf/cudf/core/column/lists.py | 21 +- python/cudf/cudf/core/column/numerical.py | 70 +- .../cudf/cudf/core/column/numerical_base.py | 11 +- python/cudf/cudf/core/column/string.py | 16 +- python/cudf/cudf/core/column/timedelta.py | 7 +- python/cudf/cudf/core/column_accessor.py | 64 +- python/cudf/cudf/core/dataframe.py | 378 ++++-- python/cudf/cudf/core/dtypes.py | 9 +- python/cudf/cudf/core/frame.py | 29 +- python/cudf/cudf/core/groupby/groupby.py | 642 +++++++--- python/cudf/cudf/core/index.py | 227 +++- python/cudf/cudf/core/indexed_frame.py | 216 +++- python/cudf/cudf/core/indexing_utils.py | 8 +- python/cudf/cudf/core/join/_join_helpers.py | 29 +- python/cudf/cudf/core/multiindex.py | 56 +- python/cudf/cudf/core/resample.py | 12 +- python/cudf/cudf/core/reshape.py | 22 +- python/cudf/cudf/core/series.py | 256 +++- python/cudf/cudf/core/single_column_frame.py | 16 +- python/cudf/cudf/core/tools/numeric.py | 2 +- python/cudf/cudf/core/window/ewm.py | 77 +- python/cudf/cudf/core/window/rolling.py | 27 +- python/cudf/cudf/io/csv.py | 2 +- python/cudf/cudf/io/orc.py | 33 +- python/cudf/cudf/io/parquet.py | 44 +- python/cudf/cudf/pandas/__init__.py | 60 +- python/cudf/cudf/pandas/__main__.py | 14 +- python/cudf/cudf/pandas/_wrappers/pandas.py | 16 +- .../cudf/pandas/scripts/run-pandas-tests.sh | 2 +- python/cudf/cudf/testing/testing.py | 10 +- python/cudf/cudf/tests/test_categorical.py | 56 + .../cudf/cudf/tests/test_column_accessor.py | 190 ++- python/cudf/cudf/tests/test_csv.py | 7 +- python/cudf/cudf/tests/test_dataframe.py | 11 +- python/cudf/cudf/tests/test_dropna.py | 9 + python/cudf/cudf/tests/test_gcs.py | 3 +- python/cudf/cudf/tests/test_groupby.py | 25 + python/cudf/cudf/tests/test_index.py | 9 + python/cudf/cudf/tests/test_multiindex.py | 18 + python/cudf/cudf/tests/test_parquet.py | 24 +- python/cudf/cudf/tests/test_reductions.py | 15 +- python/cudf/cudf/tests/test_s3.py | 136 +- python/cudf/cudf/utils/dtypes.py | 58 +- python/cudf/cudf/utils/ioutils.py | 78 +- python/cudf/cudf/utils/utils.py | 26 + python/cudf/pyproject.toml | 5 +- python/cudf_kafka/pyproject.toml | 3 +- python/cudf_polars/cudf_polars/callback.py | 12 +- .../cudf_polars/containers/column.py | 3 +- .../cudf_polars/containers/dataframe.py | 12 - python/cudf_polars/cudf_polars/dsl/expr.py | 66 +- python/cudf_polars/cudf_polars/dsl/ir.py | 214 +++- .../cudf_polars/testing/asserts.py | 34 +- .../cudf_polars/cudf_polars/utils/dtypes.py | 41 +- .../cudf_polars/cudf_polars/utils/versions.py | 1 + python/cudf_polars/docs/overview.md | 2 +- python/cudf_polars/pyproject.toml | 3 +- .../tests/expressions/test_casting.py | 52 + .../tests/expressions/test_literal.py | 18 +- .../tests/expressions/test_numeric_binops.py | 14 +- .../tests/expressions/test_stringfunction.py | 6 +- python/cudf_polars/tests/test_config.py | 34 + python/cudf_polars/tests/test_drop_nulls.py | 65 + 
python/cudf_polars/tests/test_groupby.py | 28 + python/cudf_polars/tests/test_hconcat.py | 9 + python/cudf_polars/tests/test_join.py | 93 +- python/cudf_polars/tests/test_scan.py | 163 ++- python/cudf_polars/tests/utils/test_dtypes.py | 1 + python/custreamz/pyproject.toml | 7 +- python/dask_cudf/dask_cudf/io/parquet.py | 76 +- .../dask_cudf/dask_cudf/io/tests/test_s3.py | 92 +- python/dask_cudf/pyproject.toml | 7 +- python/pylibcudf/CMakeLists.txt | 6 +- python/pylibcudf/pylibcudf/CMakeLists.txt | 1 + python/pylibcudf/pylibcudf/__init__.pxd | 3 + python/pylibcudf/pylibcudf/__init__.py | 3 + python/pylibcudf/pylibcudf/binaryop.pxd | 8 + python/pylibcudf/pylibcudf/binaryop.pyx | 35 + python/pylibcudf/pylibcudf/column.pyx | 22 + python/pylibcudf/pylibcudf/experimental.pxd | 10 + python/pylibcudf/pylibcudf/experimental.pyx | 42 + python/pylibcudf/pylibcudf/expressions.pyx | 11 + python/pylibcudf/pylibcudf/io/CMakeLists.txt | 4 +- python/pylibcudf/pylibcudf/io/__init__.pxd | 2 +- python/pylibcudf/pylibcudf/io/__init__.py | 2 +- python/pylibcudf/pylibcudf/io/datasource.pyx | 10 +- python/pylibcudf/pylibcudf/io/parquet.pxd | 34 + python/pylibcudf/pylibcudf/io/parquet.pyx | 203 +++ python/pylibcudf/pylibcudf/io/types.pyx | 8 + python/pylibcudf/pylibcudf/join.pyx | 15 +- .../pylibcudf/pylibcudf/libcudf/binaryop.pxd | 39 +- .../pylibcudf/pylibcudf/libcudf/copying.pxd | 3 +- .../pylibcudf/libcudf/exception_handler.pxd | 69 + .../pylibcudf/libcudf/experimental.pxd | 16 + .../pylibcudf/libcudf/io/parquet.pxd | 79 +- .../pylibcudf/pylibcudf/libcudf/io/types.pxd | 1 + .../pylibcudf/libcudf/lists/contains.pxd | 3 +- .../libcudf/lists/count_elements.pxd | 2 +- .../pylibcudf/libcudf/lists/filling.pxd | 18 + .../libcudf/lists/set_operations.pxd | 36 + .../pylibcudf/libcudf/lists/sorting.pxd | 6 + .../libcudf/lists/stream_compaction.pxd | 7 +- .../libcudf/scalar/scalar_factories.pxd | 3 + .../libcudf/utilities/type_dispatcher.pxd | 7 + python/pylibcudf/pylibcudf/lists.pxd | 20 +- python/pylibcudf/pylibcudf/lists.pyx | 394 +++++- python/pylibcudf/pylibcudf/scalar.pxd | 4 + python/pylibcudf/pylibcudf/scalar.pyx | 18 + .../pylibcudf/pylibcudf/tests/common/utils.py | 73 +- python/pylibcudf/pylibcudf/tests/conftest.py | 15 + .../pylibcudf/tests/io/test_parquet.py | 108 ++ .../tests/io/test_source_sink_info.py | 21 +- .../pylibcudf/tests/test_binaryops.py | 785 ++++++++++++ .../pylibcudf/tests/test_column_factories.py | 3 +- .../tests/test_column_from_device.py | 51 - .../pylibcudf/pylibcudf/tests/test_copying.py | 3 +- python/pylibcudf/pylibcudf/tests/test_join.py | 3 +- .../pylibcudf/pylibcudf/tests/test_lists.py | 259 +++- .../pylibcudf/pylibcudf/tests/test_reshape.py | 3 +- .../pylibcudf/pylibcudf/tests/test_traits.py | 2 +- .../pylibcudf/tests/test_transform.py | 3 +- .../pylibcudf/pylibcudf/tests/test_unary.py | 2 +- python/pylibcudf/pylibcudf/types.pyx | 7 +- python/pylibcudf/pyproject.toml | 7 +- 596 files changed, 12207 insertions(+), 4023 deletions(-) create mode 100644 cpp/cmake/thirdparty/get_thread_pool.cmake create mode 100644 cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff create mode 100644 cpp/examples/tpch/q10.cpp create mode 100644 cpp/include/cudf/detail/utilities/host_memory.hpp delete mode 100644 cpp/include/cudf/interop/detail/arrow.hpp create mode 100644 cpp/include/cudf/utilities/prefetch.hpp delete mode 100644 cpp/include/cudf/utilities/thread_pool.hpp create mode 100644 cpp/src/interop/decimal_conversion_utilities.cu create mode 100644 
cpp/src/interop/decimal_conversion_utilities.cuh create mode 100644 cpp/src/interop/to_arrow_host.cu rename cpp/src/utilities/{pinned_memory.cpp => host_memory.cpp} (73%) create mode 100644 cpp/src/utilities/prefetch.cpp create mode 100644 cpp/tests/interop/to_arrow_host_test.cpp rename cpp/tests/io/{json_chunked_reader.cpp => json/json_chunked_reader.cu} (64%) rename cpp/tests/io/{ => json}/json_quote_normalization_test.cpp (100%) rename cpp/tests/io/{ => json}/json_test.cpp (100%) rename cpp/tests/io/{ => json}/json_tree.cpp (99%) rename cpp/tests/io/{ => json}/json_type_cast_test.cu (100%) create mode 100644 cpp/tests/io/json/json_utils.cuh rename cpp/tests/io/{ => json}/json_whitespace_normalization_test.cu (100%) rename cpp/tests/io/{ => json}/json_writer.cpp (100%) rename cpp/tests/io/{ => json}/nested_json_test.cpp (100%) rename cpp/tests/large_strings/{json_tests.cpp => json_tests.cu} (50%) create mode 100644 cpp/tests/streams/reshape_test.cpp create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst create mode 100644 java/src/main/java/ai/rapids/cudf/Cudf.java create mode 100644 python/cudf/CMakeLists.txt create mode 100644 python/cudf_polars/tests/expressions/test_casting.py create mode 100644 python/cudf_polars/tests/test_config.py create mode 100644 python/cudf_polars/tests/test_drop_nulls.py create mode 100644 python/pylibcudf/pylibcudf/experimental.pxd create mode 100644 python/pylibcudf/pylibcudf/experimental.pyx create mode 100644 python/pylibcudf/pylibcudf/io/parquet.pxd create mode 100644 python/pylibcudf/pylibcudf/io/parquet.pyx create mode 100644 python/pylibcudf/pylibcudf/libcudf/exception_handler.pxd create mode 100644 python/pylibcudf/pylibcudf/libcudf/experimental.pxd create mode 100644 python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd create mode 100644 python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd create mode 100644 python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd create mode 100644 python/pylibcudf/pylibcudf/tests/io/test_parquet.py create mode 100644 python/pylibcudf/pylibcudf/tests/test_binaryops.py delete mode 100644 python/pylibcudf/pylibcudf/tests/test_column_from_device.py diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 8423fe21c29..7a1361e52c5 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 4945d6cf753..64d7cd54130 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": 
"rapidsai/devcontainers:24.08-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index b79c949999a..4d61427b2ce 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,20 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - // TODO: change this back to rapidsai/devcontainers - // once https://github.com/lithomas1/devcontainers/tree/pylibcudf - // is merged in - "ghcr.io/lithomas1/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/lithomas1/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/lithomas1/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index 026eb540952..beab2940176 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 937080572ad..2fc39c06fad 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ 
-46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -101,7 +101,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -111,7 +111,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index 1516cb09449..cf0c2b377dd 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: nightly diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 76645cb71c8..86314d3c9d3 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -35,41 +35,41 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: pull-request script: "ci/test_python_cudf.sh" @@ -77,14 +77,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: pull-request script: "ci/test_python_other.sh" 
conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -94,7 +94,7 @@ jobs: static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -104,7 +104,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -114,7 +114,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -124,28 +124,28 @@ jobs: wheel-build-pylibcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-cudf-polars: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -154,7 +154,7 @@ jobs: wheel-tests-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -165,7 +165,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -174,7 +174,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -182,7 +182,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 with: arch: '["amd64"]' cuda: '["12.5"]' @@ -193,7 +193,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request @@ -202,7 +202,7 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) )) build_type: pull-request @@ -212,7 +212,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 2a8ebd30993..45e5191eb54 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@cuda-12.5.1 + uses: 
rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 73f8d726e77..9feea050b19 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -45,7 +45,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -85,7 +85,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -97,7 +97,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/README.md b/README.md index 1ab6a2d7457..fd8b0365807 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=24.08 python=3.11 cuda-version=12.5 + cudf=24.10 python=3.11 cuda-version=12.5 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION index ec8489fda92..7c7ba04436f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.08.00 +24.10.00 diff --git a/ci/build_python.sh b/ci/build_python.sh index 79e09432779..3705786f8bc 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -22,6 +22,13 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) # TODO: Remove `--no-test` flag once importing on a CPU # node works correctly # With boa installed conda build forwards to the boa builder + +# TODO: enable once conda recipes written for pylibcudf # RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ # --no-test \ # --channel "${CPP_CHANNEL}" \ # conda/recipes/pylibcudf + RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index abde5e5d160..48ee4a05628 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -19,7 +19,7 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \ - -n 10 \ + -n 5 \ --tb=no \ -m "not slow" \ --max-worker-restart=3 \ diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index f629de64905..ad96aff3930 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -68,15 +68,18 @@ done # README.md update sed_runner "s/version == ${CURRENT_SHORT_TAG}/version == ${NEXT_SHORT_TAG}/g" README.md sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md +sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" python/cudf_polars/docs/overview.md +sed_runner "s/branch-${CURRENT_SHORT_TAG}/branch-${NEXT_SHORT_TAG}/g" python/cudf_polars/docs/overview.md # Libcudf examples update sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/versions.cmake # CI files -for FILE in .github/workflows/*.yaml; do +for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}" sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE}; done +sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh # Java files
NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT" diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 900acd5d473..cc9f5788685 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -10,7 +10,7 @@ set -eou pipefail # files in cudf_polars/pylibcudf", rather than "are there changes # between upstream and this branch which touch cudf_polars/pylibcudf" # TODO: is the target branch exposed anywhere in an environment variable? -if [ -n "$(git diff --name-only origin/branch-24.08...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; +if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; then HAS_CHANGES=1 else diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b8d73a01f96..b1a1cc3c68e 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -26,7 +26,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.8.*,>=0.0.0a0 +- dask-cuda==24.10.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -43,10 +43,10 @@ dependencies: - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==24.8.*,>=0.0.0a0 +- libkvikio==24.10.*,>=0.0.0a0 - libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.8.*,>=0.0.0a0 +- librmm==24.10.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python @@ -77,9 +77,9 @@ dependencies: - python>=3.9,<3.12 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- rapids-dask-dependency==24.10.*,>=0.0.0a0 - rich -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 3f5fae49cbb..1017b11779c 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -27,7 +27,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-cuda==24.8.*,>=0.0.0a0 +- dask-cuda==24.10.*,>=0.0.0a0 - dlpack>=0.8,<1.0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -42,10 +42,10 @@ dependencies: - libarrow==16.1.0.* - libcufile-dev - libcurand-dev -- libkvikio==24.8.*,>=0.0.0a0 +- libkvikio==24.10.*,>=0.0.0a0 - libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==24.8.*,>=0.0.0a0 +- librmm==24.10.*,>=0.0.0a0 - make - moto>=4.0.8 - msgpack-python @@ -75,9 +75,9 @@ dependencies: - python>=3.9,<3.12 - pytorch>=2.1.0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 -- rapids-dask-dependency==24.8.*,>=0.0.0a0 +- rapids-dask-dependency==24.10.*,>=0.0.0a0 - rich -- rmm==24.8.*,>=0.0.0a0 +- rmm==24.10.*,>=0.0.0a0 - s3fs>=2022.3.0 - scikit-build-core>=0.7.0 - scipy diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 903cff27be4..310bc99b279 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -216,6 +216,8 @@ include(cmake/thirdparty/get_fmt.cmake) include(cmake/thirdparty/get_spdlog.cmake) # find nanoarrow include(cmake/thirdparty/get_nanoarrow.cmake) +# find thread_pool +include(cmake/thirdparty/get_thread_pool.cmake) # Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved if(NOT BUILD_SHARED_LIBS) @@ -363,8 +365,10 @@ add_library( src/interop/dlpack.cpp src/interop/from_arrow.cu src/interop/arrow_utilities.cpp + src/interop/decimal_conversion_utilities.cu 
src/interop/to_arrow.cu src/interop/to_arrow_device.cu + src/interop/to_arrow_host.cu src/interop/from_arrow_device.cu src/interop/from_arrow_host.cu src/interop/from_arrow_stream.cu @@ -669,9 +673,10 @@ add_library( src/unary/null_ops.cu src/utilities/cuda_memcpy.cu src/utilities/default_stream.cpp + src/utilities/host_memory.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp - src/utilities/pinned_memory.cpp + src/utilities/prefetch.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp src/utilities/traits.cpp @@ -707,8 +712,10 @@ set_target_properties( CXX_STANDARD_REQUIRED ON # For std:: support of __int128_t. Can be removed once using cuda::std CXX_EXTENSIONS ON + CXX_VISIBILITY_PRESET hidden CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON + CUDA_VISIBILITY_PRESET hidden POSITION_INDEPENDENT_CODE ON INTERFACE_POSITION_INDEPENDENT_CODE ON ) @@ -804,7 +811,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm + PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $ PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) @@ -883,8 +890,10 @@ if(CUDF_BUILD_TESTUTIL) # set target compile options CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON + CXX_VISIBILITY_PRESET hidden CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON + CUDA_VISIBILITY_PRESET hidden POSITION_INDEPENDENT_CODE ON INTERFACE_POSITION_INDEPENDENT_CODE ON ) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 6df2cb44adc..0970003deb2 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -718,7 +718,7 @@ std::unique_ptr create_random_column(data_profi } template -struct clamp_down : public thrust::unary_function { +struct clamp_down { T max; clamp_down(T max) : max(max) {} __host__ __device__ T operator()(T x) const { return min(x, max); } diff --git a/cpp/benchmarks/copying/gather.cu b/cpp/benchmarks/copying/gather.cu index eeb0149fb3a..985166f7298 100644 --- a/cpp/benchmarks/copying/gather.cu +++ b/cpp/benchmarks/copying/gather.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -71,5 +71,5 @@ void BM_gather(benchmark::State& state) ->Ranges({{1 << 10, 1 << 26}, {1, 8}}) \ ->UseManualTime(); -GBM_BENCHMARK_DEFINE(double_coalesce_x, double, true); -GBM_BENCHMARK_DEFINE(double_coalesce_o, double, false); +GBM_BENCHMARK_DEFINE(double_coalesced, double, true); +GBM_BENCHMARK_DEFINE(double_shuffled, double, false); diff --git a/cpp/benchmarks/copying/scatter.cu b/cpp/benchmarks/copying/scatter.cu index a521dc82739..c27480b69f4 100644 --- a/cpp/benchmarks/copying/scatter.cu +++ b/cpp/benchmarks/copying/scatter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
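The `clamp_down` hunk above drops the `thrust::unary_function` base class, which newer Thrust/CCCL releases deprecate and remove; a plain functor is all `thrust::transform` needs, since the base only ever supplied `argument_type`/`result_type` typedefs. A minimal standalone sketch of the same pattern (the `clamp_to` name and sample values are illustrative, not part of this patch):

```c++
#include <thrust/device_vector.h>
#include <thrust/transform.h>

#include <cstdio>

// Plain functor: no thrust::unary_function base required.
template <typename T>
struct clamp_to {
  T max;
  __host__ __device__ T operator()(T x) const { return x < max ? x : max; }
};

int main()
{
  int const host[] = {1, 5, 9, 13};
  thrust::device_vector<int> v(host, host + 4);
  // Clamp every element to at most 8, in place.
  thrust::transform(v.begin(), v.end(), v.begin(), clamp_to<int>{8});
  std::printf("%d %d %d %d\n", int(v[0]), int(v[1]), int(v[2]), int(v[3]));
  return 0;
}
```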
@@ -74,5 +74,5 @@ void BM_scatter(benchmark::State& state) ->Ranges({{1 << 10, 1 << 25}, {1, 8}}) \ ->UseManualTime(); -SBM_BENCHMARK_DEFINE(double_coalesce_x, double, true); -SBM_BENCHMARK_DEFINE(double_coalesce_o, double, false); +SBM_BENCHMARK_DEFINE(double_coalesced, double, true); +SBM_BENCHMARK_DEFINE(double_shuffled, double, false); diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp index 8c8d6756b00..8900899f9be 100644 --- a/cpp/benchmarks/fixture/benchmark_fixture.hpp +++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp @@ -107,7 +107,7 @@ class memory_stats_logger { public: memory_stats_logger() : existing_mr(rmm::mr::get_current_device_resource()), - statistics_mr(rmm::mr::make_statistics_adaptor(existing_mr)) + statistics_mr(rmm::mr::statistics_resource_adaptor(existing_mr)) { rmm::mr::set_current_device_resource(&statistics_mr); } diff --git a/cpp/benchmarks/groupby/group_max_multithreaded.cpp b/cpp/benchmarks/groupby/group_max_multithreaded.cpp index 3b8faba618f..bf1a1a5fcf7 100644 --- a/cpp/benchmarks/groupby/group_max_multithreaded.cpp +++ b/cpp/benchmarks/groupby/group_max_multithreaded.cpp @@ -20,8 +20,8 @@ #include #include #include -#include +#include #include template @@ -58,7 +58,7 @@ void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list> requests(num_threads); for (auto& thread_requests : requests) { @@ -75,10 +75,8 @@ void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); @@ -134,7 +136,9 @@ void BM_FST_JSON_no_outidx(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); @@ -171,7 +175,9 @@ void BM_FST_JSON_no_out(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); @@ -209,7 +215,9 @@ void BM_FST_JSON_no_str(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp index aa0ee39a179..e91bf06fdfa 100644 --- a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp @@ -24,8 +24,8 @@ #include #include #include -#include +#include #include #include @@ -90,7 +90,7 @@ void BM_orc_multithreaded_read_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - 
cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; @@ -112,13 +112,11 @@ void BM_orc_multithreaded_read_common(nvbench::state& state, cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); @@ -170,7 +168,7 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, size_t const output_limit = state.get_int64("output_limit"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; std::transform(source_sink_vector.begin(), @@ -203,13 +201,11 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, } while (reader.has_next()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index b4c8ed78ed8..9e76ebb71ab 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -23,10 +23,10 @@ #include #include #include -#include #include +#include #include #include @@ -93,7 +93,7 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); std::vector source_info_vector; @@ -114,13 +114,11 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, cudf::io::read_parquet(read_opts, stream, rmm::mr::get_current_device_resource()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); @@ -176,7 +174,7 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, size_t const output_limit = state.get_int64("output_limit"); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); - cudf::detail::thread_pool threads(num_threads); + BS::thread_pool threads(num_threads); auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, 
d_types); std::vector source_info_vector; std::transform(source_sink_vector.begin(), @@ -207,13 +205,11 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, } while (reader.has_next()); }; - threads.paused = true; - for (size_t i = 0; i < num_files; ++i) { - threads.submit(read_func, i); - } + threads.pause(); + threads.detach_sequence(decltype(num_files){0}, num_files, read_func); timer.start(); - threads.paused = false; - threads.wait_for_tasks(); + threads.unpause(); + threads.wait(); cudf::detail::join_streams(streams, cudf::get_default_stream()); timer.stop(); }); diff --git a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu index dbc3234dabf..570decf410f 100644 --- a/cpp/benchmarks/lists/copying/scatter_lists.cu +++ b/cpp/benchmarks/lists/copying/scatter_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -143,5 +143,5 @@ void BM_lists_scatter(::benchmark::State& state) ->Ranges({{1 << 10, 1 << 25}, {64, 2048}}) /* 1K-1B rows, 64-2048 elements */ \ ->UseManualTime(); -SBM_BENCHMARK_DEFINE(double_type_colesce_o, double, true); -SBM_BENCHMARK_DEFINE(double_type_colesce_x, double, false); +SBM_BENCHMARK_DEFINE(double_coalesced, double, true); +SBM_BENCHMARK_DEFINE(double_shuffled, double, false); diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 025bff7d8f0..8df1b431095 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -17,11 +17,11 @@ function(find_and_configure_nanoarrow) # Currently we need to always build nanoarrow so we don't pickup a previous installed version set(CPM_DOWNLOAD_nanoarrow ON) rapids_cpm_find( - nanoarrow 0.5.0 + nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git - GIT_TAG 11e73a8c85b45e3d49c8c541b4e1497a649fe03c + GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb GIT_SHALLOW FALSE OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) diff --git a/cpp/cmake/thirdparty/get_thread_pool.cmake b/cpp/cmake/thirdparty/get_thread_pool.cmake new file mode 100644 index 00000000000..777e16d9a4f --- /dev/null +++ b/cpp/cmake/thirdparty/get_thread_pool.cmake @@ -0,0 +1,25 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
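For context on the benchmark hunks above: `cudf::detail::thread_pool` is replaced by the vendored bshoshany (BS) thread pool, whose v4 API renames the pause/submit/wait cycle. A minimal sketch of the new pattern, assuming a BS::thread_pool v4 header with the opt-in pause feature (which the `rapids_cpm_bs_thread_pool` helper below is expected to enable):

```c++
#define BS_THREAD_POOL_ENABLE_PAUSE  // pause()/unpause() are opt-in in v4
#include <BS_thread_pool.hpp>

#include <cstddef>
#include <cstdio>

int main()
{
  BS::thread_pool threads(4);

  // Queue work while paused so a timer can exclude submission overhead,
  // mirroring the benchmark loops above.
  threads.pause();
  // detach_sequence(first, last, f) enqueues f(i) for each i in [first, last).
  threads.detach_sequence(std::size_t{0}, std::size_t{8},
                          [](std::size_t i) { std::printf("task %zu\n", i); });

  threads.unpause();  // release all queued tasks at once
  threads.wait();     // block until every task has finished
  return 0;
}
```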
+# ============================================================================= + +# Need to call rapids_cpm_bs_thread_pool to get support for an installed version of thread-pool and +# to support installing it ourselves +function(find_and_configure_thread_pool) + include(${rapids-cmake-dir}/cpm/bs_thread_pool.cmake) + + # Find or install thread-pool + rapids_cpm_bs_thread_pool() + +endfunction() + +find_and_configure_thread_pool() diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index 2f29578f7ae..dcf9c1139f9 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,6 +3,11 @@ "packages" : { "CCCL" : { "patches" : [ + { + "file" : "${current_json_dir}/cccl_symbol_visibility.diff", + "issue" : "Correct symbol visibility issues in libcudacxx [https://github.com/NVIDIA/cccl/pull/1832/]", + "fixed_in" : "2.6" + }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", diff --git a/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff b/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff new file mode 100644 index 00000000000..f745d5fa314 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff @@ -0,0 +1,27 @@ +diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config +index e7c62c031b..5db861853a 100644 +--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config ++++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config +@@ -1049,7 +1049,6 @@ typedef __char32_t char32_t; + # define _LIBCUDACXX_EXPORTED_FROM_ABI __declspec(dllimport) + # endif + +-# define _LIBCUDACXX_TYPE_VIS _LIBCUDACXX_DLL_VIS + # define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_DLL_VIS + # define _LIBCUDACXX_EXCEPTION_ABI _LIBCUDACXX_DLL_VIS + # define _LIBCUDACXX_HIDDEN +@@ -1448,14 +1447,6 @@ __sanitizer_annotate_contiguous_container(const void*, const void*, const void*, + # define _LIBCUDACXX_WEAK __attribute__((__weak__)) + # endif + +-// Redefine some macros for internal use +-# if defined(__cuda_std__) +-# undef _LIBCUDACXX_FUNC_VIS +-# define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_INLINE_VISIBILITY +-# undef _LIBCUDACXX_TYPE_VIS +-# define _LIBCUDACXX_TYPE_VIS +-# endif // __cuda_std__ +- + // Thread API + # ifndef _LIBCUDACXX_HAS_THREAD_API_EXTERNAL + # if defined(_CCCL_COMPILER_NVRTC) || defined(__EMSCRIPTEN__) diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index 0d097541692..aa054ba93e9 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -52,15 +52,36 @@ header file in `cudf/cpp/include/cudf/`. For example, `cudf/cpp/include/cudf/cop contains the APIs for functions related to copying from one column to another. Note the `.hpp` file extension used to indicate a C++ header file. -Header files should use the `#pragma once` include guard. +External/public libcudf C++ API header files need to mark all symbols inside of them with `CUDF_EXPORT`. +This is done by placing the macro on the `namespace cudf` as seen below. Markup on namespaces +requires them not to be nested, so the `cudf` namespace must be kept by itself.
+ +```c++ + +#pragma once + +namespace CUDF_EXPORT cudf { +namespace lists { + +... + + +} // namespace lists +} // namespace CUDF_EXPORT cudf + +``` + The naming of external API headers should be consistent with the name of the folder that contains the source files that implement the API. For example, the implementation of the APIs found in `cudf/cpp/include/cudf/copying.hpp` are located in `cudf/src/copying`. Likewise, the unit tests for the APIs reside in `cudf/tests/copying/`. -Internal API headers containing `detail` namespace definitions that are used across translation -units inside libcudf should be placed in `include/cudf/detail`. +Internal API headers containing `detail` namespace definitions that are used across translation +units inside libcudf should be placed in `include/cudf/detail`. Just like the public C++ API headers, any +internal C++ API header requires `CUDF_EXPORT` markup on the `cudf` namespace so that the functions can be tested. + +All headers in cudf should use `#pragma once` for include guards. ## File extensions diff --git a/cpp/doxygen/developer_guide/DOCUMENTATION.md b/cpp/doxygen/developer_guide/DOCUMENTATION.md index b86f7db82b0..89376223baf 100644 --- a/cpp/doxygen/developer_guide/DOCUMENTATION.md +++ b/cpp/doxygen/developer_guide/DOCUMENTATION.md @@ -363,7 +363,7 @@ Here is an example of a doxygen description comment for a namespace declaration. * * This is the top-level namespace which contains all cuDF functions and types. */ - namespace cudf { + namespace CUDF_EXPORT cudf { A description comment should be included only once for each unique namespace declaration. Otherwise, if more than one description is found, doxygen aggregates the descriptions in an arbitrary order in the output pages. @@ -385,7 +385,7 @@ The existing groups have been carefully structured and named, so new groups shou When creating a new API, specify its group using the [\@ingroup](https://www.doxygen.nl/manual/commands.html#cmdingroup) tag and the group reference id from the [doxygen_groups.h](../include/doxygen_groups.h) file. - namespace cudf { + namespace CUDF_EXPORT cudf { /** * @brief ... @@ -401,7 +401,7 @@ When creating a new API, specify its group using the [\@ingroup](https://www.dox You can also use the \@addtogroup with a `@{ ... @}` pair to automatically include doxygen comment blocks as part of a group.
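Putting the `CUDF_EXPORT` and doxygen-group conventions together, a public header following this guide might look like the sketch below (the `do_something` declaration and group id are made up for illustration, and the include path for the macro is an assumption based on where libcudf keeps its export markup):

```c++
#pragma once

#include <cudf/utilities/export.hpp>  // assumed source of the CUDF_EXPORT macro

// The export markup goes on the non-nested `cudf` namespace; nested
// namespaces are opened separately inside it.
namespace CUDF_EXPORT cudf {
namespace lists {

/**
 * @addtogroup lists_illustrative
 * @{
 */

/**
 * @brief Illustrative declaration only, not a real libcudf API.
 */
void do_something();

/** @} */  // end of group

}  // namespace lists
}  // namespace CUDF_EXPORT cudf
```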
- namespace cudf { + namespace CUDF_EXPORT cudf { /** * @addtogroup transformation_fill * @{ diff --git a/cpp/examples/tpch/CMakeLists.txt b/cpp/examples/tpch/CMakeLists.txt index 1b91d07e148..373a6d72d56 100644 --- a/cpp/examples/tpch/CMakeLists.txt +++ b/cpp/examples/tpch/CMakeLists.txt @@ -30,3 +30,7 @@ target_compile_features(tpch_q6 PRIVATE cxx_std_17) add_executable(tpch_q9 q9.cpp) target_link_libraries(tpch_q9 PRIVATE cudf::cudf) target_compile_features(tpch_q9 PRIVATE cxx_std_17) + +add_executable(tpch_q10 q10.cpp) +target_link_libraries(tpch_q10 PRIVATE cudf::cudf) +target_compile_features(tpch_q10 PRIVATE cxx_std_17) diff --git a/cpp/examples/tpch/q1.cpp b/cpp/examples/tpch/q1.cpp index 1bdf039da4a..fe03320b888 100644 --- a/cpp/examples/tpch/q1.cpp +++ b/cpp/examples/tpch/q1.cpp @@ -124,7 +124,7 @@ int main(int argc, char const** argv) auto shipdate_upper = cudf::timestamp_scalar(days_since_epoch(1998, 9, 2), true); auto const shipdate_upper_literal = cudf::ast::literal(shipdate_upper); - auto lineitem_pred = std::make_unique( + auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal); // Read out the `lineitem` table from parquet file diff --git a/cpp/examples/tpch/q10.cpp b/cpp/examples/tpch/q10.cpp new file mode 100644 index 00000000000..94da46f6930 --- /dev/null +++ b/cpp/examples/tpch/q10.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "utils.hpp" + +#include +#include +#include + +/** + * @file q10.cpp + * @brief Implement query 10 of the TPC-H benchmark. + * + * create view customer as select * from '/tables/scale-1/customer.parquet'; + * create view orders as select * from '/tables/scale-1/orders.parquet'; + * create view lineitem as select * from '/tables/scale-1/lineitem.parquet'; + * create view nation as select * from '/tables/scale-1/nation.parquet'; + * + * select + * c_custkey, + * c_name, + * sum(l_extendedprice * (1 - l_discount)) as revenue, + * c_acctbal, + * n_name, + * c_address, + * c_phone, + * c_comment + * from + * customer, + * orders, + * lineitem, + * nation + * where + * c_custkey = o_custkey + * and l_orderkey = o_orderkey + * and o_orderdate >= date '1993-10-01' + * and o_orderdate < date '1994-01-01' + * and l_returnflag = 'R' + * and c_nationkey = n_nationkey + * group by + * c_custkey, + * c_name, + * c_acctbal, + * c_phone, + * n_name, + * c_address, + * c_comment + * order by + * revenue desc; + */ + +/** + * @brief Calculate the revenue column + * + * @param extendedprice The extended price column + * @param discount The discount column + * @param stream The CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. 
+ */ +[[nodiscard]] std::unique_ptr calc_revenue( + cudf::column_view const& extendedprice, + cudf::column_view const& discount, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) +{ + auto const one = cudf::numeric_scalar(1); + auto const one_minus_discount = + cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr); + auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64}; + auto revenue = cudf::binary_operation(extendedprice, + one_minus_discount->view(), + cudf::binary_operator::MUL, + revenue_type, + stream, + mr); + return revenue; +} +int main(int argc, char const** argv) +{ + auto const args = parse_args(argc, argv); + + // Use a memory pool + auto resource = create_memory_resource(args.memory_resource_type); + rmm::mr::set_current_device_resource(resource.get()); + + cudf::examples::timer timer; + + // Define the column projection and filter predicate for the `orders` table + std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; + auto const o_orderdate_ref = cudf::ast::column_reference(std::distance( + orders_cols.begin(), std::find(orders_cols.begin(), orders_cols.end(), "o_orderdate"))); + auto o_orderdate_lower = + cudf::timestamp_scalar(days_since_epoch(1993, 10, 1), true); + auto const o_orderdate_lower_limit = cudf::ast::literal(o_orderdate_lower); + auto const o_orderdate_pred_lower = cudf::ast::operation( + cudf::ast::ast_operator::GREATER_EQUAL, o_orderdate_ref, o_orderdate_lower_limit); + auto o_orderdate_upper = + cudf::timestamp_scalar(days_since_epoch(1994, 1, 1), true); + auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper); + auto const o_orderdate_pred_upper = + cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit); + auto const orders_pred = std::make_unique( + cudf::ast::ast_operator::LOGICAL_AND, o_orderdate_pred_lower, o_orderdate_pred_upper); + + auto const l_returnflag_ref = cudf::ast::column_reference(3); + auto r_scalar = cudf::string_scalar("R"); + auto const r_literal = cudf::ast::literal(r_scalar); + auto const lineitem_pred = std::make_unique( + cudf::ast::ast_operator::EQUAL, l_returnflag_ref, r_literal); + + // Read out the tables from parquet files + // while pushing down the column projections and filter predicates + auto const customer = read_parquet( + args.dataset_dir + "/customer.parquet", + {"c_custkey", "c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"}); + auto const orders = + read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred)); + auto const lineitem = + read_parquet(args.dataset_dir + "/lineitem.parquet", + {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"}, + std::move(lineitem_pred)); + auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_name", "n_nationkey"}); + + // Perform the joins + auto const join_a = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"}); + auto const join_b = apply_inner_join(lineitem, orders, {"l_orderkey"}, {"o_orderkey"}); + auto const joined_table = apply_inner_join(join_a, join_b, {"c_custkey"}, {"o_custkey"}); + + // Calculate and append the `revenue` column + auto revenue = + calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount")); + (*joined_table).append(revenue, "revenue"); + + // Perform the groupby operation + auto const groupedby_table = 
apply_groupby( + joined_table, + groupby_context_t{ + {"c_custkey", "c_name", "c_acctbal", "c_phone", "n_name", "c_address", "c_comment"}, + { + {"revenue", {{cudf::aggregation::Kind::SUM, "revenue"}}}, + }}); + + // Perform the order by operation + auto const orderedby_table = + apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING}); + + timer.print_elapsed_millis(); + + // Write query result to a parquet file + orderedby_table->to_parquet("q10.parquet"); + return 0; +} diff --git a/cpp/examples/tpch/q5.cpp b/cpp/examples/tpch/q5.cpp index e56850b94d6..89396a6c968 100644 --- a/cpp/examples/tpch/q5.cpp +++ b/cpp/examples/tpch/q5.cpp @@ -44,14 +44,14 @@ * region * where * c_custkey = o_custkey - * and l_orderkey = o_orderkey - * and l_suppkey = s_suppkey - * and c_nationkey = s_nationkey - * and s_nationkey = n_nationkey - * and n_regionkey = r_regionkey - * and r_name = 'ASIA' - * and o_orderdate >= date '1994-01-01' - * and o_orderdate < date '1995-01-01' + * and l_orderkey = o_orderkey + * and l_suppkey = s_suppkey + * and c_nationkey = s_nationkey + * and s_nationkey = n_nationkey + * and n_regionkey = r_regionkey + * and r_name = 'ASIA' + * and o_orderdate >= date '1994-01-01' + * and o_orderdate < date '1995-01-01' * group by * n_name * order by @@ -109,7 +109,7 @@ int main(int argc, char const** argv) auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper); auto const o_orderdate_pred_upper = cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit); - auto orders_pred = std::make_unique( + auto const orders_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, o_orderdate_pred_lower, o_orderdate_pred_upper); // Define the column projection and filter predicate for the `region` table @@ -118,7 +118,7 @@ int main(int argc, char const** argv) region_cols.begin(), std::find(region_cols.begin(), region_cols.end(), "r_name"))); auto r_name_value = cudf::string_scalar("ASIA"); auto const r_name_literal = cudf::ast::literal(r_name_value); - auto region_pred = std::make_unique( + auto const region_pred = std::make_unique( cudf::ast::ast_operator::EQUAL, r_name_ref, r_name_literal); // Read out the tables from parquet files diff --git a/cpp/examples/tpch/q6.cpp b/cpp/examples/tpch/q6.cpp index f11b3d6ab3b..405b2ac73ca 100644 --- a/cpp/examples/tpch/q6.cpp +++ b/cpp/examples/tpch/q6.cpp @@ -84,7 +84,7 @@ int main(int argc, char const** argv) cudf::ast::ast_operator::GREATER_EQUAL, shipdate_ref, shipdate_lower_literal); auto const shipdate_pred_b = cudf::ast::operation(cudf::ast::ast_operator::LESS, shipdate_ref, shipdate_upper_literal); - auto lineitem_pred = std::make_unique( + auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b); auto lineitem = read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred)); diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index 144b3d3721b..44493011673 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -12,4 +12,4 @@ # the License. 
# ============================================================================= -set(CUDF_TAG branch-24.08) +set(CUDF_TAG branch-24.10) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 3c1023017be..f5f514d26d9 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -31,7 +32,7 @@ * individual function documentation to see what aggregations are supported. */ -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup aggregation_factories * @{ @@ -770,4 +771,4 @@ template std::unique_ptr make_merge_tdigest_aggregation(int max_centroids = 1000); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index 38f7ac5291f..da552d95421 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -29,9 +29,8 @@ #include #include -namespace cudf { -namespace ast { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace ast::detail { /** * @brief Node data reference types. @@ -328,8 +327,6 @@ class expression_parser { std::vector _literals; }; -} // namespace detail +} // namespace ast::detail -} // namespace ast - -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/detail/expression_transformer.hpp b/cpp/include/cudf/ast/detail/expression_transformer.hpp index a6529c338e6..3af1663abf8 100644 --- a/cpp/include/cudf/ast/detail/expression_transformer.hpp +++ b/cpp/include/cudf/ast/detail/expression_transformer.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,8 @@ #include -namespace cudf::ast::detail { +namespace CUDF_EXPORT cudf { +namespace ast::detail { /** * @brief Base "visitor" pattern class with the `expression` class for expression transformer. 
* @@ -61,4 +62,7 @@ class expression_transformer { virtual ~expression_transformer() {} }; -} // namespace cudf::ast::detail + +} // namespace ast::detail + +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index c483d459833..46507700e21 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -29,7 +29,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace ast { @@ -1233,4 +1233,4 @@ CUDF_HOST_DEVICE inline cudf::size_type ast_operator_arity(ast_operator op) } // namespace ast -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 918271e3e4f..4299ee5f20f 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -23,7 +23,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace ast { /** * @addtogroup expressions @@ -555,4 +555,4 @@ class column_name_reference : public expression { /** @} */ // end of group } // namespace ast -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 22dad11e109..51199bb5792 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -18,13 +18,14 @@ #include #include +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup transformation_binaryops @@ -290,6 +291,17 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op, namespace binops { +/** + * @brief Returns true if the binary operator is supported for the given input types. + * + * @param out The output data type + * @param lhs The left-hand cudf::data_type + * @param rhs The right-hand cudf::data_type + * @param op The binary operator + * @return true if the binary operator is supported for the given input types + */ +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op); + /** * @brief Computes output valid mask for op between a column and a scalar * @@ -305,8 +317,13 @@ std::pair scalar_col_valid_mask_and( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -namespace compiled { -namespace detail { +} // namespace binops + +/** @} */ // end of group +} // namespace CUDF_EXPORT cudf + +namespace CUDF_EXPORT cudf { +namespace binops::compiled::detail { /** * @brief struct binary operation using `NaN` aware sorting physical element comparators @@ -326,9 +343,5 @@ void apply_sorting_struct_binary_op(mutable_column_view& out, bool is_rhs_scalar, binary_operator op, rmm::cuda_stream_view stream); -} // namespace detail -} // namespace compiled -} // namespace binops - -/** @} */ // end of group -} // namespace cudf +} // namespace binops::compiled::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index 22db25bdc83..5d1d74c3f28 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -36,7 +36,7 @@ * @brief Class definition for cudf::column */ -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @brief A container of nullable device data as a column of elements. 
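The `binops::is_supported_operation` declaration added to `cudf/binaryop.hpp` above makes operator dispatch queryable before any work is launched. A minimal sketch of calling it with the exact signature shown in that hunk (the type and operator choices are illustrative):

```c++
#include <cudf/binaryop.hpp>
#include <cudf/types.hpp>

// Ask libcudf whether INT32 ADD INT32 producing INT64 is supported before
// attempting the corresponding cudf::binary_operation call.
bool int32_add_supported()
{
  auto const out = cudf::data_type{cudf::type_id::INT64};
  auto const in  = cudf::data_type{cudf::type_id::INT32};
  return cudf::binops::is_supported_operation(out, in, in, cudf::binary_operator::ADD);
}
```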
@@ -332,4 +332,4 @@ class column { }; /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 787e9c2c479..89fe59bfeaa 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -44,7 +44,7 @@ * @brief Column device view class definitions */ -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @brief Indicates the presence of nulls at compile-time or runtime. @@ -1527,4 +1527,4 @@ ColumnDeviceView* child_columns_to_device_array(ColumnViewIterator child_begin, } } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index dc4700576e6..c1f295b7ea8 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -27,7 +27,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup column_factories * @{ @@ -571,4 +571,4 @@ std::unique_ptr make_dictionary_from_scalar( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 134e835911f..3ef7bafe727 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,9 @@ #pragma once #include +#include #include +#include #include #include #include @@ -29,8 +31,7 @@ * @file column_view.hpp * @brief column view class definitions */ - -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @brief A non-owning, immutable view of device data as a column of elements, @@ -72,7 +73,7 @@ class column_view_base { CUDF_ENABLE_IF(std::is_same_v or is_rep_layout_compatible())> T const* head() const noexcept { - return static_cast(_data); + return static_cast(get_data()); } /** @@ -225,6 +226,17 @@ class column_view_base { [[nodiscard]] size_type offset() const noexcept { return _offset; } protected: + /** + * @brief Returns pointer to the base device memory allocation. + * + * The primary purpose of this function is to allow derived classes to + * override the fundamental properties of memory accesses without needing to + * change all of the different accessors for the underlying pointer. 
+ * + * @return Typed pointer to underlying data + */ + virtual void const* get_data() const noexcept { return _data; } + data_type _type{type_id::EMPTY}; ///< Element type size_type _size{}; ///< Number of elements void const* _data{}; ///< Pointer to device memory containing elements @@ -236,7 +248,7 @@ class column_view_base { ///< Enables zero-copy slicing column_view_base() = default; - ~column_view_base() = default; + virtual ~column_view_base() = default; column_view_base(column_view_base const&) = default; ///< Copy constructor column_view_base(column_view_base&&) = default; ///< Move constructor /** @@ -284,10 +296,6 @@ class column_view_base { size_type offset = 0); }; -class mutable_column_view_base : public column_view_base { - public: - protected: -}; } // namespace detail /** @@ -323,7 +331,7 @@ class column_view : public detail::column_view_base { #ifdef __CUDACC__ #pragma nv_exec_check_disable #endif - ~column_view() = default; + ~column_view() override = default; #ifdef __CUDACC__ #pragma nv_exec_check_disable #endif @@ -447,6 +455,18 @@ class column_view : public detail::column_view_base { return device_span(data(), size()); } + protected: + /** + * @brief Returns pointer to the base device memory allocation. + * + * The primary purpose of this function is to allow derived classes to + * override the fundamental properties of memory accesses without needing to + * change all of the different accessors for the underlying pointer. + * + * @return Typed pointer to underlying data + */ + void const* get_data() const noexcept override; + private: friend column_view bit_cast(column_view const& input, data_type type); @@ -478,7 +498,7 @@ class mutable_column_view : public detail::column_view_base { public: mutable_column_view() = default; - ~mutable_column_view(){ + ~mutable_column_view() override{ // Needed so that the first instance of the implicit destructor for any TU isn't 'constructed' // from a host+device function marking the implicit version also as host+device }; @@ -572,7 +592,7 @@ class mutable_column_view : public detail::column_view_base { } /** - * @brief Return first element (accounting for offset) when underlying data is + * @brief Return first element (accounting for offset) after underlying data is * casted to the specified type. * * This function does not participate in overload resolution if `is_rep_layout_compatible` is @@ -665,6 +685,18 @@ class mutable_column_view : public detail::column_view_base { */ operator column_view() const; + protected: + /** + * @brief Returns pointer to the base device memory allocation. + * + * The primary purpose of this function is to allow derived classes to + * override the fundamental properties of memory accesses without needing to + * change all of the different accessors for the underlying pointer. 
+ * + * @return Typed pointer to underlying data + */ + void const* get_data() const noexcept override; + private: friend mutable_column_view bit_cast(mutable_column_view const& input, data_type type); @@ -765,5 +797,6 @@ std::size_t shallow_hash(column_view const& input); * @return If `lhs` and `rhs` have equivalent shallow state */ bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs); + } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/concatenate.hpp b/cpp/include/cudf/concatenate.hpp index e7b55a2e6d0..0935bdf7def 100644 --- a/cpp/include/cudf/concatenate.hpp +++ b/cpp/include/cudf/concatenate.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -25,7 +26,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup copy_concatenate * @{ @@ -97,4 +98,4 @@ std::unique_ptr concatenate( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 0d4f20d1ef2..195dac25268 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -18,13 +18,14 @@ #include #include +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup copy_split @@ -124,8 +125,14 @@ std::vector contiguous_split( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); namespace detail { + +/** + * @brief A helper struct containing the state of contiguous_split, whether the caller + * is using the single-pass contiguous_split or chunked_pack. + * + */ struct contiguous_split_state; -}; +} // namespace detail /** * @brief Perform a chunked "pack" operation of the input `table_view` using a user provided @@ -338,4 +345,4 @@ table_view unpack(packed_columns const& input); table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data); /** @} */ -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index b17cafb05ab..3c44ff48fdf 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -30,7 +31,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup column_copy @@ -913,7 +914,7 @@ bool may_have_nonempty_nulls(column_view const& input); * * @code{.pseudo} * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} }.release(); - * cudf::detail::set_null_mask(lists->null_mask(), 1, 2, false); + * cudf::set_null_mask(lists->null_mask(), 1, 2, false); * * lists[1] is now null, but the lists child column still stores `{2,3}`. * The lists column contents will be: @@ -929,7 +930,7 @@ bool may_have_nonempty_nulls(column_view const& input); * * @code{.pseudo} * auto const strings = strings_column_wrapper{ "AB", "CD", "EF" }.release(); - * cudf::detail::set_null_mask(strings->null_mask(), 1, 2, false); + * cudf::set_null_mask(strings->null_mask(), 1, 2, false); * * strings[1] is now null, but the strings column still stores `"CD"`. 
* The lists column contents will be: @@ -972,4 +973,4 @@ std::unique_ptr purge_nonempty_nulls( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 06b7d24f6cd..f7bed8bdc7e 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -28,7 +29,7 @@ * @brief DateTime column APIs. */ -namespace cudf { +namespace CUDF_EXPORT cudf { namespace datetime { /** * @addtogroup datetime_extract @@ -401,4 +402,4 @@ std::unique_ptr round_datetimes( /** @} */ // end of group } // namespace datetime -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 843414817e3..b257eef1e9e 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -26,7 +26,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { // Visitor pattern @@ -1674,4 +1674,4 @@ constexpr inline bool is_valid_aggregation() bool is_valid_aggregation(data_type source, aggregation::Kind k); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp index 41eec156c47..ec5a511bb7c 100644 --- a/cpp/include/cudf/detail/aggregation/result_cache.hpp +++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { struct pair_column_aggregation_equal_to { bool operator()(std::pair const& lhs, @@ -66,4 +66,4 @@ class result_cache { }; } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp index de1fde8bc96..fe739327a08 100644 --- a/cpp/include/cudf/detail/binaryop.hpp +++ b/cpp/include/cudf/detail/binaryop.hpp @@ -17,11 +17,12 @@ #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { //! Inner interfaces and implementations namespace detail { @@ -77,4 +78,4 @@ std::unique_ptr binary_operation(column_view const& lhs, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/concatenate.hpp b/cpp/include/cudf/detail/concatenate.hpp index 3e039175542..1be269710b2 100644 --- a/cpp/include/cudf/detail/concatenate.hpp +++ b/cpp/include/cudf/detail/concatenate.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -26,7 +27,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { //! Inner interfaces and implementations namespace detail { /** @@ -48,4 +49,4 @@ std::unique_ptr
concatenate(host_span tables_to_concat, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/concatenate_masks.hpp b/cpp/include/cudf/detail/concatenate_masks.hpp index dd2fb471a7d..fc829361fde 100644 --- a/cpp/include/cudf/detail/concatenate_masks.hpp +++ b/cpp/include/cudf/detail/concatenate_masks.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -24,7 +25,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { //! Inner interfaces and implementations namespace detail { @@ -69,4 +70,4 @@ rmm::device_buffer concatenate_masks(host_span views, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp index 1467ed1aa67..52c51daa917 100644 --- a/cpp/include/cudf/detail/contiguous_split.hpp +++ b/cpp/include/cudf/detail/contiguous_split.hpp @@ -23,7 +23,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -125,4 +125,4 @@ std::vector pack_metadata(table_view const& table, metadata_builder& builder); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index f7430eb090d..2be432c0825 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -28,7 +28,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @brief Constructs a zero-copy `column_view`/`mutable_column_view` of the @@ -280,4 +280,4 @@ std::unique_ptr purge_nonempty_nulls(column_view const& input, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index a93c06d4371..95469de8ae6 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -23,7 +23,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace datetime { namespace detail { /** @@ -174,4 +174,4 @@ std::unique_ptr extract_quarter(cudf::column_view const& column, } // namespace detail } // namespace datetime -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/fill.hpp b/cpp/include/cudf/detail/fill.hpp index 6996cda6974..82c6af8b611 100644 --- a/cpp/include/cudf/detail/fill.hpp +++ b/cpp/include/cudf/detail/fill.hpp @@ -25,7 +25,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -52,4 +52,4 @@ std::unique_ptr fill(column_view const& input, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 5977c7341c1..41f5494f78f 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -518,7 +518,7 @@ struct column_gatherer_impl { * Positive indices are unchanged by this transformation. 
*/ template -struct index_converter : public thrust::unary_function { +struct index_converter { index_converter(size_type n_rows) : n_rows(n_rows) {} __device__ map_type operator()(map_type in) const { return ((in % n_rows) + n_rows) % n_rows; } @@ -571,13 +571,13 @@ void gather_bitmask(table_view const& source, not target[i]->nullable()) { auto const state = op == gather_bitmask_op::PASSTHROUGH ? mask_state::ALL_VALID : mask_state::UNINITIALIZED; - auto mask = detail::create_null_mask(target[i]->size(), state, stream, mr); + auto mask = cudf::create_null_mask(target[i]->size(), state, stream, mr); target[i]->set_null_mask(std::move(mask), 0); } } // Make device array of target bitmask pointers - std::vector target_masks(target.size()); + auto target_masks = make_host_vector(target.size(), stream); std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) { return col->mutable_view().null_mask(); }); diff --git a/cpp/include/cudf/detail/gather.hpp b/cpp/include/cudf/detail/gather.hpp index 36824f56895..39cd43934e3 100644 --- a/cpp/include/cudf/detail/gather.hpp +++ b/cpp/include/cudf/detail/gather.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -27,7 +28,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { @@ -84,4 +85,4 @@ std::unique_ptr
gather(table_view const& source_table, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/groupby.hpp b/cpp/include/cudf/detail/groupby.hpp index 5a8c9b0a27f..36eae05ce39 100644 --- a/cpp/include/cudf/detail/groupby.hpp +++ b/cpp/include/cudf/detail/groupby.hpp @@ -25,10 +25,8 @@ #include #include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace CUDF_EXPORT cudf { +namespace groupby::detail::hash { /** * @brief Indicates if a set of aggregation requests can be satisfied with a * hash-based groupby implementation. @@ -47,8 +45,5 @@ std::pair, std::vector> groupby( null_policy include_null_keys, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace hash - -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace groupby::detail::hash +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp index 389c7952875..c0910b4d5ae 100644 --- a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp +++ b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp @@ -24,7 +24,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace groupby { namespace detail { @@ -45,4 +45,4 @@ std::unique_ptr group_replace_nulls(cudf::column_view const& grouped_val } // namespace detail } // namespace groupby -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index 567efedb9b2..a411a890622 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -25,10 +25,8 @@ #include #include -namespace cudf { -namespace groupby { -namespace detail { -namespace sort { +namespace CUDF_EXPORT cudf { +namespace groupby::detail::sort { /** * @brief Helper class for computing sort-based groupby * @@ -229,7 +227,5 @@ struct sort_groupby_helper { std::vector _null_precedence; ///< How to sort NULLs }; -} // namespace sort -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace groupby::detail::sort +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 5b2b9b5e69d..0b9319ba663 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -34,12 +34,13 @@ #include #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -156,4 +157,4 @@ constexpr std::size_t max_precision() } } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/is_element_valid.hpp b/cpp/include/cudf/detail/is_element_valid.hpp index 72a85d42eb3..4b74d12f306 100644 --- a/cpp/include/cudf/detail/is_element_valid.hpp +++ b/cpp/include/cudf/detail/is_element_valid.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
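A note on the recurring change in these hunks: `namespace cudf {` becomes `namespace CUDF_EXPORT cudf {` so that declarations in the namespace keep default visibility even when libcudf is compiled with hidden symbol visibility. C++17 permits an attribute in a namespace definition, which is what makes this one-macro approach work. A minimal sketch of what such a macro could look like, assuming a GCC/Clang toolchain (the real definition lives in cudf/utilities/export.hpp and may differ):

    // Hypothetical reconstruction of an export macro like CUDF_EXPORT.
    #if defined(__GNUC__) || defined(__clang__)
    #define CUDF_EXPORT __attribute__((visibility("default")))
    #define CUDF_HIDDEN __attribute__((visibility("hidden")))
    #else
    #define CUDF_EXPORT
    #define CUDF_HIDDEN
    #endif

    // The attribute applies to the namespace as a whole, so symbols declared
    // inside remain visible under -fvisibility=hidden.
    namespace CUDF_EXPORT cudf {
    namespace detail {
    void usable_from_tests();  // exported despite living in a detail namespace
    }  // namespace detail
    }  // namespace CUDF_EXPORT cudf

The per-function `CUDF_EXPORT` markers added to some detail declarations below presumably serve the same purpose for individual symbols that tests and other components link against directly.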
@@ -18,10 +18,11 @@ #include #include +#include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -44,4 +45,4 @@ bool is_element_valid_sync(column_view const& col_view, rmm::cuda_stream_view stream); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp index aabfff746ea..ff7da4462a2 100644 --- a/cpp/include/cudf/detail/join.hpp +++ b/cpp/include/cudf/detail/join.hpp @@ -34,15 +34,12 @@ // Forward declaration namespace cudf::experimental::row::equality { -class preprocessed_table; +class CUDF_EXPORT preprocessed_table; } -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { -// Forward declaration -class cuco_allocator; - constexpr int DEFAULT_JOIN_CG_SIZE = 2; enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; @@ -188,4 +185,4 @@ struct hash_join { rmm::device_async_resource_ref mr) const; }; } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/label_bins.hpp b/cpp/include/cudf/detail/label_bins.hpp index 9f6dcce448d..92a417b0132 100644 --- a/cpp/include/cudf/detail/label_bins.hpp +++ b/cpp/include/cudf/detail/label_bins.hpp @@ -27,7 +27,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { @@ -55,4 +55,4 @@ std::unique_ptr label_bins(column_view const& input, /** @} */ // end of group } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/merge.hpp b/cpp/include/cudf/detail/merge.hpp index 56ac0554403..72e34b76158 100644 --- a/cpp/include/cudf/detail/merge.hpp +++ b/cpp/include/cudf/detail/merge.hpp @@ -16,12 +16,14 @@ #pragma once +#include + #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -59,4 +61,4 @@ std::unique_ptr merge(std::vector const& tables_to_merg rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index e62675cbc8c..ae6db5409cc 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -430,7 +430,9 @@ std::vector segmented_count_bits(bitmask_type const* bitmask, if (num_segments == 0) { return std::vector{}; } // Construct a contiguous host buffer of indices and copy to device. - auto const h_indices = std::vector(indices_begin, indices_end); + auto h_indices = make_empty_host_vector::value_type>( + std::distance(indices_begin, indices_end), stream); + std::copy(indices_begin, indices_end, std::back_inserter(h_indices)); auto const d_indices = make_device_uvector_async(h_indices, stream, rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 04d8d663acb..67e3617d873 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -25,7 +25,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -63,6 +63,7 @@ void set_null_mask(bitmask_type* bitmask, * @param stream CUDA stream used for device memory operations and kernel launches. * @return The number of non-zero bits in the specified range. 
*/ +CUDF_EXPORT cudf::size_type count_set_bits(bitmask_type const* bitmask, size_type start, size_type stop, @@ -82,6 +83,7 @@ cudf::size_type count_set_bits(bitmask_type const* bitmask, * @param stream CUDA stream used for device memory operations and kernel launches. * @return The number of zero bits in the specified range. */ +CUDF_EXPORT cudf::size_type count_unset_bits(bitmask_type const* bitmask, size_type start, size_type stop, @@ -100,6 +102,7 @@ cudf::size_type count_unset_bits(bitmask_type const* bitmask, * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @return A vector storing the number of non-zero bits in the specified ranges. */ +CUDF_EXPORT std::vector segmented_count_set_bits(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream); @@ -117,6 +120,7 @@ std::vector segmented_count_set_bits(bitmask_type const* bitmask, * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @return A vector storing the number of zero bits in the specified ranges. */ +CUDF_EXPORT std::vector segmented_count_unset_bits(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream); @@ -137,6 +141,7 @@ std::vector segmented_count_unset_bits(bitmask_type const* bitmask, * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @return The number of valid elements in the specified range. */ +CUDF_EXPORT cudf::size_type valid_count(bitmask_type const* bitmask, size_type start, size_type stop, @@ -169,6 +174,7 @@ cudf::size_type null_count(bitmask_type const* bitmask, * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @return A vector storing the number of valid elements in each specified range. */ +CUDF_EXPORT std::vector segmented_valid_count(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream); @@ -189,6 +195,7 @@ std::vector segmented_valid_count(bitmask_type const* bitmask, * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @return A vector storing the number of null elements in each specified range. 
*/ +CUDF_EXPORT std::vector segmented_null_count(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream); @@ -220,6 +227,7 @@ rmm::device_buffer copy_bitmask(column_view const& view, * * @param stream CUDA stream used for device memory operations and kernel launches */ +CUDF_EXPORT std::pair bitmask_and(host_span masks, host_span masks_begin_bits, size_type mask_size_bits, @@ -279,4 +287,4 @@ void set_all_valid_null_masks(column_view const& input, } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp index 6c188d2ca68..23d5fb73ba3 100644 --- a/cpp/include/cudf/detail/quantiles.hpp +++ b/cpp/include/cudf/detail/quantiles.hpp @@ -18,11 +18,12 @@ #include #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -64,4 +65,4 @@ std::unique_ptr percentile_approx(tdigest::tdigest_column_view const& in rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/repeat.hpp b/cpp/include/cudf/detail/repeat.hpp index abb9e45a95c..e17f1b7c5fd 100644 --- a/cpp/include/cudf/detail/repeat.hpp +++ b/cpp/include/cudf/detail/repeat.hpp @@ -24,7 +24,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -51,4 +51,4 @@ std::unique_ptr
repeat(table_view const& input_table, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp index 46203bdf2f0..e2bd729861b 100644 --- a/cpp/include/cudf/detail/replace.hpp +++ b/cpp/include/cudf/detail/replace.hpp @@ -24,7 +24,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @copydoc cudf::replace_nulls(column_view const&, column_view const&, @@ -102,4 +102,4 @@ std::unique_ptr normalize_nans_and_zeros(column_view const& input, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp index 7a1c3d6c4f0..68a856373bf 100644 --- a/cpp/include/cudf/detail/reshape.hpp +++ b/cpp/include/cudf/detail/reshape.hpp @@ -24,12 +24,10 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @copydoc cudf::tile - * - * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr
tile(table_view const& input, size_type count, @@ -38,12 +36,10 @@ std::unique_ptr
tile(table_view const& input, /** * @copydoc cudf::interleave_columns - * - * @param stream CUDA stream used for device memory operations and kernel launches */ std::unique_ptr interleave_columns(table_view const& input, rmm::cuda_stream_view, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/rolling.hpp b/cpp/include/cudf/detail/rolling.hpp index ea6f38c421c..5bfa5679531 100644 --- a/cpp/include/cudf/detail/rolling.hpp +++ b/cpp/include/cudf/detail/rolling.hpp @@ -26,7 +26,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -49,4 +49,4 @@ std::unique_ptr rolling_window(column_view const& input, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/round.hpp b/cpp/include/cudf/detail/round.hpp index 1a9c5c82c65..ba3ef1c1ce7 100644 --- a/cpp/include/cudf/detail/round.hpp +++ b/cpp/include/cudf/detail/round.hpp @@ -22,7 +22,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { //! Inner interfaces and implementations namespace detail { @@ -39,4 +39,4 @@ std::unique_ptr round(column_view const& input, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index 54c25d0157c..bd60309c5c3 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -17,11 +17,12 @@ #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -73,6 +74,7 @@ std::unique_ptr scan_exclusive(column_view const& input, * @param mr Device memory resource used to allocate the returned scalar's device memory. * @returns Column with scan results. */ +CUDF_EXPORT std::unique_ptr scan_inclusive(column_view const& input, scan_aggregation const& agg, null_policy null_handling, @@ -99,6 +101,7 @@ std::unique_ptr inclusive_rank_scan(column_view const& order_by, * @param mr Device memory resource used to allocate the returned column's device memory. * @return rank values. */ +CUDF_EXPORT std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -117,4 +120,4 @@ std::unique_ptr inclusive_one_normalized_percent_rank_scan( column_view const& order_by, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index 95ed6af8c3c..6691ddc5c09 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -26,7 +27,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @brief Scatters the rows of the source table into a copy of the target table @@ -144,4 +145,4 @@ std::unique_ptr
boolean_mask_scatter( rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp index e60b18f4c8d..72e2cf074bc 100644 --- a/cpp/include/cudf/detail/search.hpp +++ b/cpp/include/cudf/detail/search.hpp @@ -25,7 +25,9 @@ #include #include -namespace cudf::detail { +namespace CUDF_EXPORT cudf { +namespace detail { + /** * @copydoc cudf::lower_bound * @@ -92,6 +94,7 @@ std::unique_ptr contains(column_view const& haystack, * @param mr Device memory resource used to allocate the returned vector * @return A vector of bools indicating if each row in `needles` has matching rows in `haystack` */ +CUDF_EXPORT rmm::device_uvector contains(table_view const& haystack, table_view const& needles, null_equality compare_nulls, @@ -99,4 +102,5 @@ rmm::device_uvector contains(table_view const& haystack, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace cudf::detail +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/sequence.hpp b/cpp/include/cudf/detail/sequence.hpp index a18a9d3b200..a08010a610f 100644 --- a/cpp/include/cudf/detail/sequence.hpp +++ b/cpp/include/cudf/detail/sequence.hpp @@ -23,7 +23,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @copydoc cudf::sequence(size_type size, scalar const& init, scalar const& step, @@ -65,4 +65,4 @@ std::unique_ptr calendrical_month_sequence(size_type size, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp index 4ddba38a7e9..08cf329f199 100644 --- a/cpp/include/cudf/detail/sorting.hpp +++ b/cpp/include/cudf/detail/sorting.hpp @@ -26,7 +26,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -163,4 +163,4 @@ std::unique_ptr
stable_sort(table_view const& values, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index e3ef4190fd2..05194148a70 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -25,7 +25,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @copydoc cudf::drop_nulls(table_view const&, std::vector const&, @@ -148,4 +148,4 @@ cudf::size_type distinct_count(table_view const& input, rmm::cuda_stream_view stream); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp index beedc009c84..7de68035b19 100644 --- a/cpp/include/cudf/detail/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -25,9 +25,8 @@ #include #include -#include - -namespace cudf::structs::detail { +namespace CUDF_EXPORT cudf { +namespace structs::detail { enum class column_nullability { MATCH_INCOMING, ///< generate a null column if the incoming column has nulls @@ -268,4 +267,5 @@ class flattened_table { */ bool contains_null_structs(column_view const& col); -} // namespace cudf::structs::detail +} // namespace structs::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index bfd12c18fff..10eb3d389c7 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -18,14 +18,14 @@ #include #include +#include #include #include #include -namespace cudf { -namespace tdigest { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace tdigest::detail { /** * @brief Generate a tdigest column from a grouped, sorted set of numeric input values. @@ -152,6 +152,7 @@ std::unique_ptr make_tdigest_column(size_type num_rows, * * @returns An empty tdigest column. */ +CUDF_EXPORT std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -236,6 +237,5 @@ std::unique_ptr reduce_merge_tdigest(column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace tdigest -} // namespace cudf +} // namespace tdigest::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp index 037164aa297..c7798ff60ed 100644 --- a/cpp/include/cudf/detail/timezone.hpp +++ b/cpp/include/cudf/detail/timezone.hpp @@ -16,11 +16,13 @@ #pragma once #include +#include #include #include -namespace cudf::detail { +namespace CUDF_EXPORT cudf { +namespace detail { /** * @copydoc cudf::make_timezone_transition_table(std::optional, std::string_view, @@ -34,4 +36,5 @@ std::unique_ptr
make_timezone_transition_table( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -} // namespace cudf::detail +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 47e13fa2e5e..02849ef023c 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -19,11 +19,12 @@ #include #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @copydoc cudf::transform @@ -112,4 +113,4 @@ std::unique_ptr segmented_row_bit_count(table_view const& t, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/transpose.hpp b/cpp/include/cudf/detail/transpose.hpp index 1f8effc8103..559b2c32996 100644 --- a/cpp/include/cudf/detail/transpose.hpp +++ b/cpp/include/cudf/detail/transpose.hpp @@ -18,11 +18,12 @@ #include #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @copydoc cudf::transpose @@ -34,4 +35,4 @@ std::pair, table_view> transpose(table_view const& input rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index 5245cfdf079..bb05138bc8c 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -26,7 +27,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @brief Creates a column of `type_id::BOOL8` elements by applying a predicate to every element @@ -101,4 +102,4 @@ std::unique_ptr is_not_nan(cudf::column_view const& input, rmm::device_async_resource_ref mr); } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/alignment.hpp b/cpp/include/cudf/detail/utilities/alignment.hpp index e52032fe104..2677eca34db 100644 --- a/cpp/include/cudf/detail/utilities/alignment.hpp +++ b/cpp/include/cudf/detail/utilities/alignment.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
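The detail headers in this patch share one convention worth spelling out: each `@copydoc`'d detail function mirrors a public API but takes `rmm::cuda_stream_view stream` (and usually `rmm::device_async_resource_ref mr`) as explicit, required parameters, while the public overload defaults them. A hedged sketch of the pattern, using `cudf::round` purely as an illustration; the forwarding body is assumed, not copied from the sources:

    namespace CUDF_EXPORT cudf {
    namespace detail {
    // Detail overload: stream and mr are explicit so internal callers can
    // compose work on their own streams.
    std::unique_ptr<column> round(column_view const& input,
                                  int32_t decimal_places,
                                  rounding_method method,
                                  rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr);
    }  // namespace detail

    // Public overload: documented once, defaults filled in, thin forwarder.
    std::unique_ptr<column> round(column_view const& input,
                                  int32_t decimal_places,
                                  rounding_method method,
                                  rmm::device_async_resource_ref mr)
    {
      return detail::round(input, decimal_places, method,
                           cudf::get_default_stream(), mr);
    }
    }  // namespace CUDF_EXPORT cudf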
@@ -18,7 +18,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -43,4 +43,4 @@ T* align_ptr_for_type(void* destination) } } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index b66c461ab12..632d5a732ec 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -16,9 +16,12 @@ #pragma once +#include + #include -namespace cudf::detail { +namespace CUDF_EXPORT cudf { +namespace detail { enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; @@ -50,4 +53,5 @@ void cuda_memcpy_async( void cuda_memcpy( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); -} // namespace cudf::detail +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/default_stream.hpp b/cpp/include/cudf/detail/utilities/default_stream.hpp index fa438f142b7..f988355e6e0 100644 --- a/cpp/include/cudf/detail/utilities/default_stream.hpp +++ b/cpp/include/cudf/detail/utilities/default_stream.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,12 @@ #pragma once +#include + #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { @@ -33,4 +35,4 @@ extern rmm::cuda_stream_view const default_stream_value; } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp new file mode 100644 index 00000000000..c6775a950c9 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cudf::detail { +/** + * @brief Get the memory resource to be used for pageable memory allocations. + * + * @return Reference to the pageable memory resource + */ +CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource(); + +/** + * @brief Get the allocator to be used for the host memory allocation. 
+ * + * @param size The number of elements of type T to allocate + * @param stream The stream to use for the allocation + * @return The allocator to be used for the host memory allocation + */ +template +rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view stream) +{ + if (size * sizeof(T) <= get_allocate_host_as_pinned_threshold()) { + return {get_pinned_memory_resource(), stream}; + } + return {get_pageable_memory_resource(), stream}; +} + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index 2d14d0306cd..d4dd7b0d626 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -28,7 +29,8 @@ #include #include // for bad_alloc -namespace cudf::detail { +namespace CUDF_EXPORT cudf { +namespace detail { /*! \p rmm_host_allocator is a CUDA-specific host memory allocator * that employs \c a `rmm::host_async_resource_ref` for allocation. @@ -61,6 +63,10 @@ class rmm_host_allocator { }; }; +template +inline constexpr bool contains_property = + (cuda::std::is_same_v || ... || false); + /*! \p rmm_host_allocator is a CUDA-specific host memory allocator * that employs \c `rmm::host_async_resource_ref` for allocation. * @@ -100,8 +106,12 @@ class rmm_host_allocator { /** * @brief Construct from a `cudf::host_async_resource_ref` */ - rmm_host_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) - : mr(_mr), stream(_stream) + template + rmm_host_allocator(cuda::mr::async_resource_ref _mr, + rmm::cuda_stream_view _stream) + : mr(_mr), + stream(_stream), + _is_device_accessible{contains_property} { } @@ -173,15 +183,26 @@ class rmm_host_allocator { */ inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } + bool is_device_accessible() const { return _is_device_accessible; } + private: rmm::host_async_resource_ref mr; rmm::cuda_stream_view stream; + bool _is_device_accessible; }; /** * @brief A vector class with rmm host memory allocator */ template -using host_vector = thrust::host_vector>; +class host_vector : public thrust::host_vector> { + public: + using base = thrust::host_vector>; + + host_vector(rmm_host_allocator const& alloc) : base(alloc) {} + + host_vector(size_t size, rmm_host_allocator const& alloc) : base(size, alloc) {} +}; -} // namespace cudf::detail +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/linked_column.hpp b/cpp/include/cudf/detail/utilities/linked_column.hpp index 0feef0f1a44..0b388938754 100644 --- a/cpp/include/cudf/detail/utilities/linked_column.hpp +++ b/cpp/include/cudf/detail/utilities/linked_column.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,11 +18,13 @@ #include #include +#include #include #include -namespace cudf::detail { +namespace CUDF_EXPORT cudf { +namespace detail { struct linked_column_view; @@ -68,4 +70,5 @@ struct linked_column_view : public column_view_base { */ LinkedColVector table_to_linked_columns(table_view const& table); -} // namespace cudf::detail +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/stacktrace.hpp b/cpp/include/cudf/detail/utilities/stacktrace.hpp index c3ec9ce7a52..f54f5f3579a 100644 --- a/cpp/include/cudf/detail/utilities/stacktrace.hpp +++ b/cpp/include/cudf/detail/utilities/stacktrace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,12 @@ #pragma once +#include + #include -namespace cudf::detail { +namespace CUDF_EXPORT cudf { +namespace detail { /** * @addtogroup utility_stacktrace * @{ @@ -44,4 +47,5 @@ std::string get_stacktrace(capture_last_stackframe capture_last_frame); /** @} */ // end of group -} // namespace cudf::detail +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/stream_pool.hpp b/cpp/include/cudf/detail/utilities/stream_pool.hpp index 64c1d4ae514..dfe028bc5b7 100644 --- a/cpp/include/cudf/detail/utilities/stream_pool.hpp +++ b/cpp/include/cudf/detail/utilities/stream_pool.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -23,7 +24,8 @@ #include #include -namespace cudf::detail { +namespace CUDF_EXPORT cudf { +namespace detail { class cuda_stream_pool { public: @@ -122,4 +124,5 @@ cuda_stream_pool& global_cuda_stream_pool(); */ void join_streams(host_span streams, rmm::cuda_stream_view stream); -} // namespace cudf::detail +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 20cb55bb1c7..a9d91cdeee1 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,9 +21,12 @@ * @file vector_factories.hpp */ +#include +#include #include #include #include +#include #include #include @@ -32,11 +35,9 @@ #include #include -#include - #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -100,11 +101,12 @@ rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(), - source_data.data(), - source_data.size() * sizeof(T), - cudaMemcpyDefault, - stream.value())); + auto const is_pinned = source_data.is_device_accessible(); + cuda_memcpy_async(ret.data(), + source_data.data(), + source_data.size() * sizeof(T), + is_pinned ? 
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); return ret; } @@ -271,21 +273,11 @@ rmm::device_uvector make_device_uvector_sync( return make_device_uvector_sync(device_span{c}, stream, mr); } -// Utility function template to allow copying to either a thrust::host_vector or std::vector -template -OutContainer make_vector_async(device_span v, rmm::cuda_stream_view stream) -{ - OutContainer result(v.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync( - result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value())); - return result; -} - /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a * `device_span` * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -295,14 +287,17 @@ OutContainer make_vector_async(device_span v, rmm::cuda_stream_view str template std::vector make_std_vector_async(device_span v, rmm::cuda_stream_view stream) { - return make_vector_async>(v, stream); + std::vector result(v.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync( + result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value())); + return result; } /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -324,7 +319,7 @@ std::vector make_std_vector_async(Container cons * @brief Synchronously construct a `std::vector` containing a copy of data from a * `device_span` * - * @note This function does a synchronize on `stream`. + * @note This function does a synchronize on `stream` after the copy. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -361,11 +356,46 @@ std::vector make_std_vector_sync(Container const return make_std_vector_sync(device_span{c}, stream); } +/** + * @brief Construct a `cudf::detail::host_vector` of the given size. + * + * @note The returned vector may be using a pinned memory resource. + * + * @tparam T The type of the vector data + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory + * @return A host_vector of the given size + */ +template +host_vector make_host_vector(size_t size, rmm::cuda_stream_view stream) +{ + return host_vector(size, get_host_allocator(size, stream)); +} + +/** + * @brief Construct an empty `cudf::detail::host_vector` with the given capacity. + * + * @note The returned vector may be using a pinned memory resource. + * + * @tparam T The type of the vector data + * @param capacity Initial capacity of the vector + * @param stream The stream on which to allocate memory + * @return A host_vector with the given capacity + */ +template +host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view stream) +{ + auto result = host_vector(get_host_allocator(capacity, stream)); + result.reserve(capacity); + return result; +} + /** * @brief Asynchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. The returned vector may be + * using a pinned memory resource. 
* * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -373,16 +403,24 @@ std::vector make_std_vector_sync(Container const * @return The data copied to the host */ template -thrust::host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) +host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - return make_vector_async>(v, stream); + auto result = make_host_vector(v.size(), stream); + auto const is_pinned = result.get_allocator().is_device_accessible(); + cuda_memcpy_async(result.data(), + v.data(), + v.size() * sizeof(T), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); + return result; } /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. The returned vector may be + * using a pinned memory resource. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -394,8 +432,8 @@ template < typename Container, std::enable_if_t< std::is_convertible_v>>* = nullptr> -thrust::host_vector make_host_vector_async( - Container const& c, rmm::cuda_stream_view stream) +host_vector make_host_vector_async(Container const& c, + rmm::cuda_stream_view stream) { return make_host_vector_async(device_span{c}, stream); } @@ -404,7 +442,8 @@ thrust::host_vector make_host_vector_async( * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * - * @note This function does a synchronize on `stream`. + * @note This function does a synchronize on `stream` after the copy. The returned vector may be + * using a pinned memory resource. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -412,7 +451,7 @@ thrust::host_vector make_host_vector_async( * @return The data copied to the host */ template -thrust::host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_view stream) +host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_view stream) { auto result = make_host_vector_async(v, stream); stream.synchronize(); @@ -423,7 +462,7 @@ thrust::host_vector make_host_vector_sync(device_span v, rmm::cuda_s * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -435,8 +474,8 @@ template < typename Container, std::enable_if_t< std::is_convertible_v>>* = nullptr> -thrust::host_vector make_host_vector_sync( - Container const& c, rmm::cuda_stream_view stream) +host_vector make_host_vector_sync(Container const& c, + rmm::cuda_stream_view stream) { return make_host_vector_sync(device_span{c}, stream); } @@ -444,7 +483,7 @@ thrust::host_vector make_host_vector_sync( /** * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function may not synchronize `stream`. + * @note This function may not synchronize `stream` after the copy. 
* * @tparam T The type of the vector data * @param size The number of elements in the created vector @@ -460,7 +499,7 @@ host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view strea /** * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. * * @tparam T The type of the vector data * @param size The number of elements in the created vector @@ -477,4 +516,4 @@ host_vector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index 64a3c4edf78..56a2c76b741 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -97,7 +97,7 @@ std::pair valid_if(InputIterator begin, size_type size = thrust::distance(begin, end); - auto null_mask = detail::create_null_mask(size, mask_state::UNINITIALIZED, stream, mr); + auto null_mask = cudf::create_null_mask(size, mask_state::UNINITIALIZED, stream, mr); size_type null_count{0}; if (size > 0) { diff --git a/cpp/include/cudf/dictionary/detail/concatenate.hpp b/cpp/include/cudf/dictionary/detail/concatenate.hpp index 55f3825b3ec..0eb17aa06f4 100644 --- a/cpp/include/cudf/dictionary/detail/concatenate.hpp +++ b/cpp/include/cudf/dictionary/detail/concatenate.hpp @@ -23,9 +23,8 @@ #include #include -namespace cudf { -namespace dictionary { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace dictionary::detail { /** * @brief Returns a single column by vertically concatenating the given vector of * dictionary columns. @@ -42,6 +41,5 @@ std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace dictionary -} // namespace cudf +} // namespace dictionary::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/dictionary/detail/encode.hpp b/cpp/include/cudf/dictionary/detail/encode.hpp index 3b5a3bbab56..cc7ffbd397f 100644 --- a/cpp/include/cudf/dictionary/detail/encode.hpp +++ b/cpp/include/cudf/dictionary/detail/encode.hpp @@ -23,9 +23,8 @@ #include #include -namespace cudf { -namespace dictionary { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace dictionary::detail { /** * @brief Construct a dictionary column by dictionary encoding an existing column. * @@ -84,6 +83,5 @@ std::unique_ptr decode(dictionary_column_view const& dictionary_column, */ data_type get_indices_type_for_size(size_type keys_size); -} // namespace detail -} // namespace dictionary -} // namespace cudf +} // namespace dictionary::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/dictionary/detail/merge.hpp b/cpp/include/cudf/dictionary/detail/merge.hpp index c4229690ff5..a1777d412fe 100644 --- a/cpp/include/cudf/dictionary/detail/merge.hpp +++ b/cpp/include/cudf/dictionary/detail/merge.hpp @@ -22,9 +22,8 @@ #include #include -namespace cudf { -namespace dictionary { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace dictionary::detail { /** * @brief Merges two dictionary columns. 
@@ -47,6 +46,5 @@ std::unique_ptr merge(dictionary_column_view const& lcol, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace dictionary -} // namespace cudf +} // namespace dictionary::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/dictionary/detail/replace.hpp b/cpp/include/cudf/dictionary/detail/replace.hpp index 81a91d57169..1e1ee182fc5 100644 --- a/cpp/include/cudf/dictionary/detail/replace.hpp +++ b/cpp/include/cudf/dictionary/detail/replace.hpp @@ -23,9 +23,8 @@ #include #include -namespace cudf { -namespace dictionary { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace dictionary::detail { /** * @brief Create a new dictionary column by replacing nulls with values @@ -62,6 +61,5 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace dictionary -} // namespace cudf +} // namespace dictionary::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/dictionary/detail/search.hpp b/cpp/include/cudf/dictionary/detail/search.hpp index 2563b96b214..921acc258a9 100644 --- a/cpp/include/cudf/dictionary/detail/search.hpp +++ b/cpp/include/cudf/dictionary/detail/search.hpp @@ -18,11 +18,12 @@ #include #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace dictionary { namespace detail { @@ -63,4 +64,4 @@ std::unique_ptr get_insert_index(dictionary_column_view const& dictionar } // namespace detail } // namespace dictionary -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp index 9cdda773dbb..9eb812eb8ee 100644 --- a/cpp/include/cudf/dictionary/detail/update_keys.hpp +++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp @@ -24,9 +24,8 @@ #include #include -namespace cudf { -namespace dictionary { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace dictionary::detail { /** * @copydoc cudf::dictionary::add_keys(dictionary_column_view const&,column_view * const&,rmm::device_async_resource_ref) @@ -103,6 +102,5 @@ std::vector> match_dictionaries( std::pair>, std::vector> match_dictionaries( std::vector tables, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace dictionary -} // namespace cudf +} // namespace dictionary::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index 9f2bc90c0b2..dc822fee38b 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,7 +23,7 @@ * @brief Class definition for cudf::dictionary_column_view */ -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup dictionary_classes * @{ @@ -124,4 +124,4 @@ class dictionary_column_view : private column_view { namespace dictionary { // defined here for doxygen output } -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/dictionary/dictionary_factories.hpp b/cpp/include/cudf/dictionary/dictionary_factories.hpp index 7cdfa3bf9e5..2f663c4af61 100644 --- a/cpp/include/cudf/dictionary/dictionary_factories.hpp +++ b/cpp/include/cudf/dictionary/dictionary_factories.hpp @@ -23,7 +23,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup column_factories Factories * @{ @@ -87,12 +87,17 @@ std::unique_ptr make_dictionary_column( * @param indices_column Indices to use for the new dictionary column. * @param null_mask Null mask for the output column. * @param null_count Number of nulls for the output column. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column. */ -std::unique_ptr make_dictionary_column(std::unique_ptr keys_column, - std::unique_ptr indices_column, - rmm::device_buffer&& null_mask, - size_type null_count); +std::unique_ptr make_dictionary_column( + std::unique_ptr keys_column, + std::unique_ptr indices_column, + rmm::device_buffer&& null_mask, + size_type null_count, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Construct a dictionary column by taking ownership of the provided keys @@ -122,4 +127,4 @@ std::unique_ptr make_dictionary_column( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp index 768e2be2b0d..9e68c947793 100644 --- a/cpp/include/cudf/dictionary/encode.hpp +++ b/cpp/include/cudf/dictionary/encode.hpp @@ -22,7 +22,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace dictionary { /** * @addtogroup dictionary_encode @@ -86,4 +86,4 @@ std::unique_ptr decode( /** @} */ // end of group } // namespace dictionary -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/dictionary/search.hpp b/cpp/include/cudf/dictionary/search.hpp index 1dff6dc1d5d..66275de33e9 100644 --- a/cpp/include/cudf/dictionary/search.hpp +++ b/cpp/include/cudf/dictionary/search.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace dictionary { /** * @addtogroup dictionary_search @@ -50,4 +50,4 @@ std::unique_ptr get_index( /** @} */ // end of group } // namespace dictionary -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/dictionary/update_keys.hpp b/cpp/include/cudf/dictionary/update_keys.hpp index ce7057359a1..c02e91f8d78 100644 --- a/cpp/include/cudf/dictionary/update_keys.hpp +++ b/cpp/include/cudf/dictionary/update_keys.hpp @@ -22,7 +22,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace dictionary { /** * @addtogroup dictionary_update @@ -169,4 +169,4 @@ std::vector> match_dictionaries( /** @} */ // end of group } // namespace dictionary -} // namespace cudf +} // namespace CUDF_EXPORT 
cudf diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index 90139e8634a..054f1e859f4 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -18,13 +18,14 @@ #include #include +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup transformation_fill * @{ @@ -244,4 +245,4 @@ std::unique_ptr calendrical_month_sequence( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index c9cbc603226..ea2f5d4b6ca 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -30,7 +30,7 @@ #include /// `fixed_point` and supporting types -namespace numeric { +namespace CUDF_EXPORT numeric { /** * @addtogroup fixed_point_classes @@ -799,4 +799,4 @@ using decimal64 = fixed_point; ///< 64-bit decima using decimal128 = fixed_point<__int128_t, Radix::BASE_10>; ///< 128-bit decimal fixed point /** @} */ // end of group -} // namespace numeric +} // namespace CUDF_EXPORT numeric diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/floating_conversion.hpp index f12177c6a4b..f0d50edccd1 100644 --- a/cpp/include/cudf/fixed_point/floating_conversion.hpp +++ b/cpp/include/cudf/fixed_point/floating_conversion.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -24,7 +25,7 @@ #include -namespace numeric { +namespace CUDF_EXPORT numeric { /** * @addtogroup floating_conversion @@ -1142,4 +1143,4 @@ CUDF_HOST_DEVICE inline FloatingType convert_integral_to_floating(Rep const& val } // namespace detail /** @} */ // end of group -} // namespace numeric +} // namespace CUDF_EXPORT numeric diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp index 17dba6c2452..2bafe235058 100644 --- a/cpp/include/cudf/fixed_point/temporary.hpp +++ b/cpp/include/cudf/fixed_point/temporary.hpp @@ -24,7 +24,7 @@ #include #include -namespace numeric { +namespace CUDF_EXPORT numeric { namespace detail { template @@ -81,4 +81,4 @@ constexpr auto exp10(int32_t exponent) } } // namespace detail -} // namespace numeric +} // namespace CUDF_EXPORT numeric diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp index 831ef68ed15..f7df9c1aa9b 100644 --- a/cpp/include/cudf/groupby.hpp +++ b/cpp/include/cudf/groupby.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -31,7 +32,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { //! 
`groupby` APIs namespace groupby { namespace detail { @@ -420,4 +421,4 @@ class groupby { }; /** @} */ } // namespace groupby -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 3c2f6dfe0d5..b8be2af6967 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -17,11 +17,12 @@ #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup column_hash @@ -187,4 +188,4 @@ std::unique_ptr xxhash_64( } // namespace hashing /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index 77266ceb48f..1a459430346 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -24,9 +24,8 @@ #include #include -namespace cudf { -namespace hashing { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace hashing::detail { std::unique_ptr murmurhash3_x86_32(table_view const& input, uint32_t seed, @@ -109,9 +108,8 @@ constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) return lhs ^ (rhs + 0x9e37'79b9'7f4a'7c15 + (lhs << 6) + (lhs >> 2)); } -} // namespace detail -} // namespace hashing -} // namespace cudf +} // namespace hashing::detail +} // namespace CUDF_EXPORT cudf // specialization of std::hash for cudf::data_type namespace std { diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 11f6ce2bad7..9a8f87b4a46 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -53,7 +54,7 @@ struct ArrowArray; struct ArrowArrayStream; -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup interop_dlpack * @{ @@ -136,6 +137,8 @@ struct column_metadata { * Converts the `cudf::table_view` to `arrow::Table` with the provided * metadata `column_names`. * + * @deprecated Since 24.08. Use cudf::to_arrow_host instead. + * * @throws cudf::logic_error if `column_names` size doesn't match with number of columns. * * @param input table_view that needs to be converted to arrow Table @@ -150,16 +153,19 @@ struct column_metadata { * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be * converted to Arrow decimal128 of the precision 38. */ -std::shared_ptr to_arrow(table_view input, - std::vector const& metadata = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); +[[deprecated("Use cudf::to_arrow_host")]] std::shared_ptr to_arrow( + table_view input, + std::vector const& metadata = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @brief Create `arrow::Scalar` from cudf scalar `input` * * Converts the `cudf::scalar` to `arrow::Scalar`. * + * @deprecated Since 24.08. + * * @param input scalar that needs to be converted to arrow Scalar * @param metadata Contains hierarchy of names of columns and children * @param stream CUDA stream used for device memory operations and kernel launches @@ -172,10 +178,11 @@ std::shared_ptr to_arrow(table_view input, * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be * converted to Arrow decimal128 of the precision 38. 
*/ -std::shared_ptr to_arrow(cudf::scalar const& input, - column_metadata const& metadata = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); +[[deprecated("Use cudf::to_arrow_host")]] std::shared_ptr to_arrow( + cudf::scalar const& input, + column_metadata const& metadata = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @brief typedef for a unique_ptr to an ArrowSchema with custom deleter @@ -329,15 +336,67 @@ unique_device_array_t to_arrow_device( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +/** + * @brief Copy table view data to host and create `ArrowDeviceArray` for it + * + * Populates the C struct ArrowDeviceArray, copying the cudf data to the host. The + * returned ArrowDeviceArray will have a device_type of CPU and will have no ties + * to the memory referenced by the table view passed in. The deleter for the + * returned unique_ptr will call the release callback on the ArrowDeviceArray + * automatically. + * + * @note For decimals, since the precision is not stored for them in libcudf, it will + * be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of precision 38. + * + * @param table Input table + * @param stream CUDA stream used for the device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray generated from input table + */ +unique_device_array_t to_arrow_host( + cudf::table_view const& table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Copy column view data to host and create `ArrowDeviceArray` for it + * + * Populates the C struct ArrowDeviceArray, copying the cudf data to the host. The + * returned ArrowDeviceArray will have a device_type of CPU and will have no ties + * to the memory referenced by the column view passed in. The deleter for the + * returned unique_ptr will call the release callback on the ArrowDeviceArray + * automatically. + * + * @note For decimals, since the precision is not stored for them in libcudf, it will + * be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of precision 38. + * + * @param col Input column + * @param stream CUDA stream used for the device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray generated from input column + */ +unique_device_array_t to_arrow_host( + cudf::column_view const& col, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + /** * @brief Create `cudf::table` from given arrow Table input * + * @deprecated Since 24.08. Use cudf::from_arrow_host instead. 
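A hedged usage sketch of the to_arrow_host overloads declared above (the function and its body are illustrative; the `array` member of ArrowDeviceArray matches the C Device Data Interface struct whose vendored copy is deleted later in this patch):

#include <cudf/interop.hpp>

void export_to_host(cudf::table_view const& tbl)
{
  // Copies the device data to host memory; the result has device_type CPU
  // and owns its buffers, so it has no ties to the input view's memory.
  cudf::unique_device_array_t out = cudf::to_arrow_host(tbl);
  // out->array is a plain ArrowArray that any Arrow C Data Interface
  // consumer can import; the unique_ptr deleter invokes the release callback.
}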
+ * * @param input arrow::Table that needs to be converted to `cudf::table` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow Table */ -std::unique_ptr<table> from_arrow( +[[deprecated("Use cudf::from_arrow_host")]] std::unique_ptr<table> from_arrow( arrow::Table const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); @@ -345,12 +404,17 @@ std::unique_ptr<table>
from_arrow( /** * @brief Create `cudf::scalar` from given arrow Scalar input * + * @deprecated Since 24.08. Use arrow's `MakeArrayFromScalar` on the + * input, followed by `ExportArray` to obtain something that can be + * consumed by `from_arrow_host`. Then use `cudf::get_element` to + * extract a device scalar from the column. + * * @param input `arrow::Scalar` that needs to be converted to `cudf::scalar` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate `cudf::scalar` * @return cudf scalar generated from given arrow Scalar */ -std::unique_ptr from_arrow( +[[deprecated("See docstring for migration strategies")]] std::unique_ptr from_arrow( arrow::Scalar const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); @@ -585,4 +649,4 @@ unique_column_view_t from_arrow_device_column( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/interop/detail/arrow.hpp b/cpp/include/cudf/interop/detail/arrow.hpp deleted file mode 100644 index 906d48f636b..00000000000 --- a/cpp/include/cudf/interop/detail/arrow.hpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// from Arrow C Device Data Interface -// https://arrow.apache.org/docs/format/CDeviceDataInterface.html -#ifndef ARROW_C_DEVICE_DATA_INTERFACE -#define ARROW_C_DEVICE_DATA_INTERFACE - -// Device type for the allocated memory -using ArrowDeviceType = int32_t; - -// The Arrow spec specifies using macros rather than enums here to avoid being -// susceptible to changes in the underlying type chosen by the compiler, but -// clang-tidy doesn't like this. 
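A hedged sketch of the scalar migration path the @deprecated note above describes. `from_arrow_host_column` is assumed from the cudf interop API (it is not shown in this hunk), ARROW_DEVICE_CPU comes from the Arrow C Device Data Interface, and error handling is reduced to early returns:

#include <arrow/api.h>
#include <arrow/c/bridge.h>
#include <cudf/copying.hpp>  // cudf::get_element
#include <cudf/interop.hpp>

std::unique_ptr<cudf::scalar> scalar_from_arrow(arrow::Scalar const& input)
{
  // 1. Lift the scalar into a length-1 arrow::Array.
  auto arr = arrow::MakeArrayFromScalar(input, 1).ValueOrDie();
  // 2. Export it through the Arrow C Data Interface.
  ArrowSchema schema;
  ArrowArray array;
  if (!arrow::ExportArray(*arr, &array, &schema).ok()) { return nullptr; }
  // 3. Wrap as a CPU-resident ArrowDeviceArray and import as a cudf column.
  ArrowDeviceArray host_array{};
  host_array.array       = array;
  host_array.device_type = ARROW_DEVICE_CPU;
  auto col = cudf::from_arrow_host_column(&schema, &host_array);
  // 4. Extract row 0 back out as a device scalar.
  return cudf::get_element(col->view(), 0);
}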
-// NOLINTBEGIN -// CPU device, same as using ArrowArray directly -#define ARROW_DEVICE_CPU 1 -// CUDA GPU Device -#define ARROW_DEVICE_CUDA 2 -// Pinned CUDA CPU memory by cudaMallocHost -#define ARROW_DEVICE_CUDA_HOST 3 -// CUDA managed/unified memory allocated by cudaMallocManaged -#define ARROW_DEVICE_CUDA_MANAGED 13 -// NOLINTEND - -struct ArrowDeviceArray { - struct ArrowArray array; - int64_t device_id; - ArrowDeviceType device_type; - void* sync_event; - - // reserved bytes for future expansion - int64_t reserved[3]; -}; - -#endif // ARROW_C_DEVICE_DATA_INTERFACE diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp index d7a48c34e12..ed5c839cbb4 100644 --- a/cpp/include/cudf/io/arrow_io_source.hpp +++ b/cpp/include/cudf/io/arrow_io_source.hpp @@ -18,6 +18,8 @@ #include "datasource.hpp" +#include + #include #include @@ -25,7 +27,8 @@ #include #include -namespace cudf::io { +namespace CUDF_EXPORT cudf { +namespace io { /** * @addtogroup io_datasources * @{ @@ -86,4 +89,5 @@ class arrow_io_source : public datasource { }; /** @} */ // end of group -} // namespace cudf::io +} // namespace io +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp index 8bc74eb574c..63f9ea3a624 100644 --- a/cpp/include/cudf/io/avro.hpp +++ b/cpp/include/cudf/io/avro.hpp @@ -28,7 +28,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace io { /** * @addtogroup io_readers @@ -221,4 +221,4 @@ table_with_metadata read_avro( /** @} */ // end of group } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index cc361f0918e..bbb4636a5a3 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -31,7 +31,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace io { /** @@ -1762,4 +1762,4 @@ void write_csv(csv_writer_options const& options, /** @} */ // end of group } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 69d8a388d45..e1eb9c042c7 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { //! IO interfaces namespace io { @@ -209,4 +209,4 @@ class data_sink { /** @} */ // end of group } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 28263d466f3..b12fbe39a57 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -25,7 +26,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { //! 
IO interfaces namespace io { @@ -376,4 +377,4 @@ class datasource { /** @} */ // end of group } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index fe9f935d2cc..13f695d6866 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -18,14 +18,13 @@ #include #include +#include #include #include -namespace cudf { -namespace io { -namespace detail { -namespace avro { +namespace CUDF_EXPORT cudf { +namespace io::detail::avro { /** * @brief Reads the entire dataset. @@ -42,7 +41,5 @@ table_with_metadata read_avro(std::unique_ptr&& source, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace avro -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace io::detail::avro +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 2a70fa888f4..d4cad2f70fd 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -17,14 +17,13 @@ #pragma once #include +#include #include #include -namespace cudf { -namespace io { -namespace detail { -namespace csv { +namespace CUDF_EXPORT cudf { +namespace io::detail::csv { /** * @brief Reads the entire dataset. @@ -56,7 +55,5 @@ void write_csv(data_sink* sink, csv_writer_options const& options, rmm::cuda_stream_view stream); -} // namespace csv -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace io::detail::csv +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 6ff1c12831b..42b10a78ce8 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -18,11 +18,13 @@ #include #include +#include #include #include -namespace cudf::io::json::detail { +namespace CUDF_EXPORT cudf { +namespace io::json::detail { /** * @brief Reads and returns the entire data set. 
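The rewrites above also collapse the old three-deep namespace blocks into C++17 nested namespace definitions. One detail worth spelling out: an attribute may only appear on a non-nested namespace definition, which is presumably why the patch opens `cudf` on its own and nests the rest. A minimal illustration (names arbitrary):

// Invalid: attributes are not allowed on a nested namespace definition.
// namespace CUDF_EXPORT cudf::io::detail::avro { }

// Valid: attribute on the single-level definition, C++17 shorthand inside.
namespace CUDF_EXPORT cudf {
namespace io::detail::avro {
void read();
}  // namespace io::detail::avro
}  // namespace CUDF_EXPORT cudf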
@@ -73,4 +75,5 @@ void normalize_single_quotes(datasource::owning_buffer void normalize_whitespace(datasource::owning_buffer>& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace cudf::io::json::detail +} // namespace io::json::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 597ddd9cf0a..7538cf7d29c 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -29,12 +30,13 @@ #include #include -namespace cudf::io { +namespace CUDF_EXPORT cudf { +namespace io { // Forward declaration -class orc_reader_options; -class orc_writer_options; -class chunked_orc_writer_options; +class CUDF_EXPORT orc_reader_options; +class CUDF_EXPORT orc_writer_options; +class CUDF_EXPORT chunked_orc_writer_options; namespace orc::detail { @@ -183,4 +185,5 @@ class writer { }; } // namespace orc::detail -} // namespace cudf::io +} // namespace io +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 21c870cb75e..a6945e0b7ab 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -32,12 +33,13 @@ #include #include -namespace cudf::io { +namespace CUDF_EXPORT cudf { +namespace io { // Forward declaration -class parquet_reader_options; -class parquet_writer_options; -class chunked_parquet_writer_options; +class CUDF_EXPORT parquet_reader_options; +class CUDF_EXPORT parquet_writer_options; +class CUDF_EXPORT chunked_parquet_writer_options; namespace parquet::detail { @@ -257,4 +259,5 @@ class writer { */ parquet_metadata read_parquet_metadata(host_span const> sources); } // namespace parquet::detail -} // namespace cudf::io +} // namespace io +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/tokenize_json.hpp b/cpp/include/cudf/io/detail/tokenize_json.hpp index d08c4e7c65a..715eb855daa 100644 --- a/cpp/include/cudf/io/detail/tokenize_json.hpp +++ b/cpp/include/cudf/io/detail/tokenize_json.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -117,7 +118,7 @@ enum token_t : PdaTokenT { NUM_TOKENS }; -namespace detail { +namespace CUDF_EXPORT detail { /** * @brief Parses the given JSON string and emits a sequence of tokens that demarcate relevant @@ -136,6 +137,6 @@ std::pair, rmm::device_uvector> ge rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail +} // namespace CUDF_EXPORT detail } // namespace cudf::io::json diff --git a/cpp/include/cudf/io/detail/utils.hpp b/cpp/include/cudf/io/detail/utils.hpp index 7bbda21858d..d0da9b410ce 100644 --- a/cpp/include/cudf/io/detail/utils.hpp +++ b/cpp/include/cudf/io/detail/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,13 +16,14 @@ #pragma once -namespace cudf { -namespace io { -namespace detail { +#include + +namespace CUDF_EXPORT cudf { +namespace io::detail { /** * @brief Whether writer writes in chunks or all at once */ enum class single_write_mode : bool { YES, NO }; -} // namespace detail -} // namespace io -} // namespace cudf + +} // namespace io::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7af90766ad0..0cb39d15cd5 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -30,7 +30,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace io { /** * @addtogroup io_readers @@ -333,14 +333,14 @@ class json_reader_options { * * @param offset Number of bytes of offset */ - void set_byte_range_offset(size_type offset) { _byte_range_offset = offset; } + void set_byte_range_offset(size_t offset) { _byte_range_offset = offset; } /** * @brief Set number of bytes to read. * * @param size Number of bytes to read */ - void set_byte_range_size(size_type size) { _byte_range_size = size; } + void set_byte_range_size(size_t size) { _byte_range_size = size; } /** * @brief Set delimiter separating records in JSON lines @@ -1024,4 +1024,4 @@ void write_json(json_writer_options const& options, /** @} */ // end of group } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 623c1d9fc72..8d484b15872 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -31,7 +32,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace io { /** * @addtogroup io_readers @@ -426,7 +427,7 @@ class chunked_orc_reader { * * This is added just to satisfy cython. */ - chunked_orc_reader() = default; + chunked_orc_reader(); /** * @brief Construct the reader from input/output size limits, output row granularity, along with @@ -1429,7 +1430,12 @@ class orc_chunked_writer { * @brief Default constructor, this should never be used. * This is added just to satisfy cython. */ - orc_chunked_writer() = default; + orc_chunked_writer(); + + /** + * @brief virtual destructor, Added so we don't leak detail types. + */ + ~orc_chunked_writer(); /** * @brief Constructor with chunked writer options @@ -1459,4 +1465,4 @@ class orc_chunked_writer { /** @} */ // end of group } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 35196a19349..3c6194bb721 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -23,12 +23,13 @@ #include #include +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace io { /** * @addtogroup io_types @@ -387,4 +388,4 @@ orc_metadata read_orc_metadata(source_info const& src_info, /** @} */ // end of group } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/orc_types.hpp b/cpp/include/cudf/io/orc_types.hpp index abd81d76579..f6c03814c9b 100644 --- a/cpp/include/cudf/io/orc_types.hpp +++ b/cpp/include/cudf/io/orc_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,12 @@ #pragma once +#include + #include -namespace cudf::io::orc { +namespace CUDF_EXPORT cudf { +namespace io::orc { /** * @addtogroup io_types * @{ @@ -104,4 +107,5 @@ enum ProtofType : uint8_t { }; /** @} */ // end of group -} // namespace cudf::io::orc +} // namespace io::orc +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 4d98cae73a7..12897ac77ef 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -32,7 +33,8 @@ #include #include -namespace cudf::io { +namespace CUDF_EXPORT cudf { +namespace io { /** * @addtogroup io_readers * @{ @@ -480,8 +482,9 @@ class chunked_parquet_reader { * @brief Default constructor, this should never be used. * * This is added just to satisfy cython. + * This is added to not leak detail API */ - chunked_parquet_reader() = default; + chunked_parquet_reader(); /** * @brief Constructor for chunked reader. @@ -1380,8 +1383,9 @@ class parquet_chunked_writer { /** * @brief Default constructor, this should never be used. * This is added just to satisfy cython. + * This is added to not leak detail API */ - parquet_chunked_writer() = default; + parquet_chunked_writer(); /** * @brief Constructor with chunked writer options @@ -1391,6 +1395,11 @@ class parquet_chunked_writer { */ parquet_chunked_writer(chunked_parquet_writer_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream()); + /** + * @brief Default destructor. + * This is added to not leak detail API + */ + ~parquet_chunked_writer(); /** * @brief Writes table to output. 
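The constructor and destructor changes above, for both the ORC and Parquet chunked readers and writers, replace in-header `= default` definitions with out-of-line ones. A minimal sketch of the idiom and why it avoids leaking detail types (illustrative names, not the cudf classes):

// widget.hpp -- public header; detail::impl stays incomplete here.
#include <memory>
namespace detail { class impl; }  // forward declaration only

class widget {
 public:
  widget();   // defined in widget.cpp, where detail::impl is complete
  ~widget();  // `= default` here would require detail::impl to be complete
              // wherever the destructor is used, pulling detail headers
              // into every includer
 private:
  std::unique_ptr<detail::impl> _impl;
};

// widget.cpp
// #include "detail/impl.hpp"
// widget::widget() : _impl{std::make_unique<detail::impl>()} {}
// widget::~widget() = default;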
@@ -1423,4 +1432,5 @@ class parquet_chunked_writer { /** @} */ // end of group -} // namespace cudf::io +} // namespace io +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/parquet_metadata.hpp b/cpp/include/cudf/io/parquet_metadata.hpp index e0c406c180c..dbb1fd03dca 100644 --- a/cpp/include/cudf/io/parquet_metadata.hpp +++ b/cpp/include/cudf/io/parquet_metadata.hpp @@ -22,13 +22,14 @@ #pragma once #include +#include #include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace io { /** * @addtogroup io_types @@ -270,4 +271,4 @@ parquet_metadata read_parquet_metadata(source_info const& src_info); /** @} */ // end of group } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp index 60ee867f058..7e9256be1d3 100644 --- a/cpp/include/cudf/io/text/byte_range_info.hpp +++ b/cpp/include/cudf/io/text/byte_range_info.hpp @@ -17,11 +17,12 @@ #pragma once #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace io { namespace text { /** @@ -113,4 +114,4 @@ byte_range_info create_byte_range_info_max(); } // namespace text } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index 13aff4b3b8f..dd1d2331c1f 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -16,12 +16,13 @@ #pragma once +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace io { namespace text { @@ -120,4 +121,4 @@ class data_chunk_source { } // namespace text } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 046994d33cc..42d0540b386 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,12 +19,14 @@ #include #include #include +#include #include #include #include -namespace cudf::io::text { +namespace CUDF_EXPORT cudf { +namespace io::text { /** * @brief Creates a data source capable of producing device-buffered views of a datasource. 
@@ -84,4 +86,5 @@ std::unique_ptr make_source_from_bgzip_file(std::string_view */ std::unique_ptr make_source(cudf::string_scalar& data); -} // namespace cudf::io::text +} // namespace io::text +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp index 515bcf16de2..11eb4518210 100644 --- a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp +++ b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -26,7 +27,8 @@ #include #include -namespace cudf::io::text::detail::bgzip { +namespace CUDF_EXPORT cudf { +namespace io::text::detail::bgzip { struct header { int block_size; @@ -109,4 +111,5 @@ void write_compressed_block(std::ostream& output_stream, host_span pre_size_subfields = {}, host_span post_size_subfields = {}); -} // namespace cudf::io::text::detail::bgzip +} // namespace io::text::detail::bgzip +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp index e4e47d8f010..32187b43d34 100644 --- a/cpp/include/cudf/io/text/detail/multistate.hpp +++ b/cpp/include/cudf/io/text/detail/multistate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,9 +16,11 @@ #pragma once +#include + #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace io { namespace text { namespace detail { @@ -125,4 +127,4 @@ constexpr multistate operator+(multistate const& lhs, multistate const& rhs) } // namespace detail } // namespace text } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp index aa9185b4983..3980a7fac02 100644 --- a/cpp/include/cudf/io/text/detail/tile_state.hpp +++ b/cpp/include/cudf/io/text/detail/tile_state.hpp @@ -16,12 +16,14 @@ #pragma once +#include + #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace io { namespace text { namespace detail { @@ -147,4 +149,4 @@ struct scan_tile_state_callback { } // namespace detail } // namespace text } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index e0b9c7635e3..eee3fefc79f 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -30,7 +31,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace io { namespace text { namespace detail { @@ -223,11 +224,11 @@ struct trie { match_length.emplace_back(0); - std::vector trie_nodes; auto token_counts = std::unordered_map(); + auto trie_nodes = cudf::detail::make_empty_host_vector(tokens.size(), stream); for (uint32_t i = 0; i < tokens.size(); i++) { - trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]}); + trie_nodes.push_back(trie_node{tokens[i], match_length[i], transitions[i]}); token_counts[tokens[i]]++; } @@ -248,4 +249,4 @@ struct trie { } // namespace detail } // namespace text } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp 
b/cpp/include/cudf/io/text/multibyte_split.hpp index e29ab78ae46..8624a386d0f 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -27,7 +27,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace io { namespace text { /** @@ -120,4 +120,4 @@ std::unique_ptr<column> multibyte_split( } // namespace text } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 0c96268f6c7..3df737413fa 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -33,16 +33,16 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { //! IO interfaces namespace io { class data_sink; class datasource; } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf //! cuDF interfaces -namespace cudf { +namespace CUDF_EXPORT cudf { //! IO interfaces namespace io { /** @@ -277,6 +277,9 @@ struct column_name_info { struct table_metadata { std::vector<column_name_info> schema_info; //!< Detailed name information for the entire output hierarchy + std::vector<size_t> num_rows_per_source; //!< Number of rows read from each data source. + //!< Currently only computed for Parquet readers if no + //!< AST filters are being used. Empty vector otherwise. std::map<std::string, std::string> user_data; //!< Format-dependent metadata of the first input //!< file as key-values pairs (deprecated) std::vector<std::unordered_map<std::string, std::string>> @@ -1086,4 +1089,4 @@ class reader_column_schema { /** @} */ // end of group } // namespace io -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index ba485bd6372..f4139721475 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -32,7 +33,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @brief Enum to indicate whether the distinct join table has nested columns or not @@ -43,13 +44,24 @@ enum class has_nested : bool { YES, NO }; // forward declaration namespace hashing::detail { + +/** + * @brief Forward declaration for our Murmur Hash 3 implementation + */ template class MurmurHash3_x86_32; } // namespace hashing::detail namespace detail { + +/** + * @brief Forward declaration for our hash join + */ template class hash_join; +/** + * @brief Forward declaration for our distinct hash join + */ template class distinct_hash_join; } // namespace detail @@ -1179,4 +1191,4 @@ std::size_t conditional_left_anti_join_size( ast::expression const& binary_predicate, rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/json/json.hpp b/cpp/include/cudf/json/json.hpp index 385e8e54bdc..48d5dcf7727 100644 --- a/cpp/include/cudf/json/json.hpp +++ b/cpp/include/cudf/json/json.hpp @@ -17,13 +17,14 @@ #include #include +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup json_object @@ -173,4 +174,4 @@ std::unique_ptr get_json_object( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/labeling/label_bins.hpp b/cpp/include/cudf/labeling/label_bins.hpp index 9091e31a9ea..7eb25134ca5 100644 --- a/cpp/include/cudf/labeling/label_bins.hpp +++ b/cpp/include/cudf/labeling/label_bins.hpp
@@ -24,7 +24,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup label_bins @@ -79,4 +79,4 @@ std::unique_ptr label_bins( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp index 853562acfff..5a310e6651f 100644 --- a/cpp/include/cudf/lists/combine.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -17,11 +17,12 @@ #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { //! Lists column APIs namespace lists { @@ -102,4 +103,4 @@ std::unique_ptr concatenate_list_elements( /** @} */ // end of group } // namespace lists -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp index 060882555aa..cd0a216488c 100644 --- a/cpp/include/cudf/lists/contains.hpp +++ b/cpp/include/cudf/lists/contains.hpp @@ -17,11 +17,12 @@ #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace lists { /** * @addtogroup lists_contains @@ -182,4 +183,4 @@ std::unique_ptr index_of( /** @} */ // end of group } // namespace lists -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp index 2b9f5aa5607..a6f2ea6e68a 100644 --- a/cpp/include/cudf/lists/count_elements.hpp +++ b/cpp/include/cudf/lists/count_elements.hpp @@ -17,11 +17,12 @@ #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace lists { /** * @addtogroup lists_elements @@ -58,4 +59,4 @@ std::unique_ptr count_elements( /** @} */ // end of lists_elements group } // namespace lists -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/detail/combine.hpp b/cpp/include/cudf/lists/detail/combine.hpp index bd4c01bbb4b..07309da2814 100644 --- a/cpp/include/cudf/lists/detail/combine.hpp +++ b/cpp/include/cudf/lists/detail/combine.hpp @@ -21,9 +21,8 @@ #include -namespace cudf { -namespace lists { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace lists::detail { /** * @copydoc cudf::lists::concatenate_rows * @@ -44,6 +43,5 @@ std::unique_ptr concatenate_list_elements(column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace lists -} // namespace cudf +} // namespace lists::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/detail/concatenate.hpp b/cpp/include/cudf/lists/detail/concatenate.hpp index d67958ef260..edfa3355dcd 100644 --- a/cpp/include/cudf/lists/detail/concatenate.hpp +++ b/cpp/include/cudf/lists/detail/concatenate.hpp @@ -24,9 +24,8 @@ #include #include -namespace cudf { -namespace lists { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace lists::detail { /** * @brief Returns a single column by concatenating the given vector of @@ -48,6 +47,5 @@ std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace lists -} // namespace cudf +} // namespace lists::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/detail/contains.hpp b/cpp/include/cudf/lists/detail/contains.hpp index 638cc7afb81..1ca3651b55a 100644 --- a/cpp/include/cudf/lists/detail/contains.hpp +++ 
b/cpp/include/cudf/lists/detail/contains.hpp @@ -20,9 +20,8 @@ #include -namespace cudf { -namespace lists { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace lists::detail { /** * @copydoc cudf::lists::index_of(cudf::lists_column_view const&, @@ -71,6 +70,5 @@ std::unique_ptr contains(cudf::lists_column_view const& lists, cudf::column_view const& search_keys, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace lists -} // namespace cudf +} // namespace lists::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/detail/copying.hpp b/cpp/include/cudf/lists/detail/copying.hpp index 18a70bba5e9..76154ae7064 100644 --- a/cpp/include/cudf/lists/detail/copying.hpp +++ b/cpp/include/cudf/lists/detail/copying.hpp @@ -20,9 +20,8 @@ #include #include -namespace cudf { -namespace lists { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace lists::detail { /** * @brief Returns a new lists column created from a subset of the @@ -49,6 +48,5 @@ std::unique_ptr copy_slice(lists_column_view const& lists, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace lists -} // namespace cudf +} // namespace lists::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp index d36a4091947..96ee30dd261 100644 --- a/cpp/include/cudf/lists/detail/dremel.hpp +++ b/cpp/include/cudf/lists/detail/dremel.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,10 +17,12 @@ #pragma once #include +#include #include -namespace cudf::detail { +namespace CUDF_EXPORT cudf { +namespace detail { /** * @brief Device view for `dremel_data`. 
@@ -31,8 +33,8 @@ struct dremel_device_view { size_type const* offsets; uint8_t const* rep_levels; uint8_t const* def_levels; - size_type const leaf_data_size; - uint8_t const max_def_level; + size_type leaf_data_size; + uint8_t max_def_level; }; /** @@ -45,8 +47,8 @@ struct dremel_data { rmm::device_uvector rep_level; rmm::device_uvector def_level; - size_type const leaf_data_size; - uint8_t const max_def_level; + size_type leaf_data_size; + uint8_t max_def_level; operator dremel_device_view() const { @@ -213,4 +215,5 @@ dremel_data get_comparator_data(column_view input, std::vector nullability, bool output_as_byte_array, rmm::cuda_stream_view stream); -} // namespace cudf::detail +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/detail/extract.hpp b/cpp/include/cudf/lists/detail/extract.hpp index 6f983d44bc9..e14b93ff912 100644 --- a/cpp/include/cudf/lists/detail/extract.hpp +++ b/cpp/include/cudf/lists/detail/extract.hpp @@ -20,9 +20,8 @@ #include -namespace cudf { -namespace lists { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace lists::detail { /** * @copydoc cudf::lists::extract_list_element(lists_column_view, size_type, @@ -44,6 +43,5 @@ std::unique_ptr extract_list_element(lists_column_view lists_column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace lists -} // namespace cudf +} // namespace lists::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 0cd77556f33..294282d7caa 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ b/cpp/include/cudf/lists/detail/gather.cuh @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -276,6 +277,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, * * @returns column with elements gathered based on `gather_data` */ +CUDF_EXPORT std::unique_ptr gather_list_nested(lists_column_view const& list, gather_data& gd, rmm::cuda_stream_view stream, @@ -293,6 +295,7 @@ std::unique_ptr gather_list_nested(lists_column_view const& list, * * @returns column with elements gathered based on `gather_data` */ +CUDF_EXPORT std::unique_ptr gather_list_leaf(column_view const& column, gather_data const& gd, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/lists/detail/interleave_columns.hpp b/cpp/include/cudf/lists/detail/interleave_columns.hpp index 3aff93840a9..ae8caa853f3 100644 --- a/cpp/include/cudf/lists/detail/interleave_columns.hpp +++ b/cpp/include/cudf/lists/detail/interleave_columns.hpp @@ -21,9 +21,8 @@ #include #include -namespace cudf { -namespace lists { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace lists::detail { /** * @brief Returns a single column by interleaving rows of the given table of list elements. 
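A hedged illustration of the row-interleave semantics for list columns (values invented, not from the cudf docs):

// input table: col0 = [{0,1}, {4,5}]   col1 = [{2,3}, {6,7}]
// result:      [{0,1}, {2,3}, {4,5}, {6,7}]
// i.e. row i of every column is emitted before row i+1 of any column.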
@@ -50,6 +49,5 @@ std::unique_ptr interleave_columns(table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace lists -} // namespace cudf +} // namespace lists::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/detail/lists_column_factories.hpp b/cpp/include/cudf/lists/detail/lists_column_factories.hpp index 192aee8d811..18d66f15b1e 100644 --- a/cpp/include/cudf/lists/detail/lists_column_factories.hpp +++ b/cpp/include/cudf/lists/detail/lists_column_factories.hpp @@ -23,9 +23,8 @@ #include #include -namespace cudf { -namespace lists { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace lists::detail { /** * @brief Internal API to construct a lists column from a `list_scalar`, for public @@ -67,6 +66,5 @@ std::unique_ptr make_all_nulls_lists_column(size_type size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace lists -} // namespace cudf +} // namespace lists::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/detail/reverse.hpp b/cpp/include/cudf/lists/detail/reverse.hpp index d099a0708b9..d10d7784e6c 100644 --- a/cpp/include/cudf/lists/detail/reverse.hpp +++ b/cpp/include/cudf/lists/detail/reverse.hpp @@ -16,10 +16,12 @@ #pragma once #include +#include #include -namespace cudf::lists::detail { +namespace CUDF_EXPORT cudf { +namespace lists::detail { /** * @copydoc cudf::lists::reverse @@ -29,4 +31,5 @@ std::unique_ptr reverse(lists_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace cudf::lists::detail +} // namespace lists::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index c550ad5b94f..be76e456900 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -239,11 +239,11 @@ std::unique_ptr scatter(scalar const& slr, auto const num_rows = target.size(); if (num_rows == 0) { return cudf::empty_like(target); } - auto lv = static_cast(&slr); - bool slr_valid = slr.is_valid(stream); - rmm::device_buffer null_mask = - slr_valid ? cudf::detail::create_null_mask(1, mask_state::UNALLOCATED, stream, mr) - : cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream, mr); + auto lv = static_cast(&slr); + bool slr_valid = slr.is_valid(stream); + rmm::device_buffer null_mask = slr_valid + ? 
cudf::create_null_mask(1, mask_state::UNALLOCATED, stream, mr) + : cudf::create_null_mask(1, mask_state::ALL_NULL, stream, mr); auto offset_column = make_numeric_column(data_type{type_to_id<size_type>()}, 2, mask_state::UNALLOCATED, stream, mr); thrust::sequence(rmm::exec_policy_nosync(stream), diff --git a/cpp/include/cudf/lists/detail/set_operations.hpp b/cpp/include/cudf/lists/detail/set_operations.hpp index 8746b1ba62a..abfcef72d47 100644 --- a/cpp/include/cudf/lists/detail/set_operations.hpp +++ b/cpp/include/cudf/lists/detail/set_operations.hpp @@ -24,7 +24,8 @@ #include #include -namespace cudf::lists::detail { +namespace CUDF_EXPORT cudf { +namespace lists::detail { /** * @copydoc cudf::list::have_overlap @@ -75,4 +76,5 @@ std::unique_ptr<column> difference_distinct(lists_column_view const& lhs, rmm::device_async_resource_ref mr); /** @} */ // end of group -} // namespace cudf::lists::detail +} // namespace lists::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/detail/sorting.hpp b/cpp/include/cudf/lists/detail/sorting.hpp index e428ea84ce6..8cbfbbae769 100644 --- a/cpp/include/cudf/lists/detail/sorting.hpp +++ b/cpp/include/cudf/lists/detail/sorting.hpp @@ -20,9 +20,8 @@ #include #include -namespace cudf { -namespace lists { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace lists::detail { /** * @copydoc cudf::lists::sort_lists @@ -46,6 +45,5 @@ std::unique_ptr<column> stable_sort_lists(lists_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace lists -} // namespace cudf +} // namespace lists::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp index f5e5b29bc8f..c11e07cd190 100644 --- a/cpp/include/cudf/lists/detail/stream_compaction.hpp +++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp @@ -17,11 +17,13 @@ #include #include +#include #include #include -namespace cudf::lists::detail { +namespace CUDF_EXPORT cudf { +namespace lists::detail { /** * @copydoc cudf::lists::apply_boolean_mask(lists_column_view const&, lists_column_view const&, @@ -45,4 +47,5 @@ std::unique_ptr<column> distinct(lists_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace cudf::lists::detail +} // namespace lists::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/explode.hpp b/cpp/include/cudf/lists/explode.hpp index 81d82dcfa09..a3375887815 100644 --- a/cpp/include/cudf/lists/explode.hpp +++ b/cpp/include/cudf/lists/explode.hpp @@ -25,7 +25,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup column_reshape * @{ @@ -66,6 +66,7 @@ namespace cudf { * * @param input_table Table to explode. * @param explode_column_idx Column index to explode inside the table. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * * @return A new table with explode_col exploded. @@ -73,6 +74,7 @@ std::unique_ptr<table> explode( table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -109,6 +111,7 @@ std::unique_ptr<table> explode( * * @param input_table Table to explode. * @param explode_column_idx Column index to explode inside the table. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * * @return A new table with exploded value and position. The column order of return table is @@ -117,6 +120,7 @@ std::unique_ptr<table> explode( std::unique_ptr<table> explode_position( table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -152,6 +156,7 @@ std::unique_ptr<table> explode_position( * * @param input_table Table to explode. * @param explode_column_idx Column index to explode inside the table. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * * @return A new table with explode_col exploded. @@ -159,6 +164,7 @@ std::unique_ptr<table> explode_position( std::unique_ptr<table> explode_outer( table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -196,6 +202,7 @@ std::unique_ptr<table> explode_outer( * * @param input_table Table to explode. * @param explode_column_idx Column index to explode inside the table. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * * @return A new table with explode_col exploded. @@ -203,8 +210,9 @@ std::unique_ptr<table> explode_outer( std::unique_ptr<table>
explode_outer_position( table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/extract.hpp b/cpp/include/cudf/lists/extract.hpp index 096d276fcfb..29a02308c66 100644 --- a/cpp/include/cudf/lists/extract.hpp +++ b/cpp/include/cudf/lists/extract.hpp @@ -18,11 +18,12 @@ #include #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace lists { /** * @addtogroup lists_extract @@ -113,4 +114,4 @@ std::unique_ptr extract_list_element( /** @} */ // end of group } // namespace lists -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/filling.hpp b/cpp/include/cudf/lists/filling.hpp index 1d840c76bf8..a1f3c37ad9e 100644 --- a/cpp/include/cudf/lists/filling.hpp +++ b/cpp/include/cudf/lists/filling.hpp @@ -25,7 +25,8 @@ #include -namespace cudf::lists { +namespace CUDF_EXPORT cudf { +namespace lists { /** * @addtogroup lists_filling * @{ @@ -113,4 +114,5 @@ std::unique_ptr sequences( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf::lists +} // namespace lists +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/gather.hpp b/cpp/include/cudf/lists/gather.hpp index a0d79c05098..6359e0488c9 100644 --- a/cpp/include/cudf/lists/gather.hpp +++ b/cpp/include/cudf/lists/gather.hpp @@ -19,11 +19,12 @@ #include #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace lists { /** * @addtogroup lists_gather @@ -80,4 +81,4 @@ std::unique_ptr segmented_gather( /** @} */ // end of group } // namespace lists -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/list_device_view.cuh b/cpp/include/cudf/lists/list_device_view.cuh index 170a20bd7f5..29b81135d64 100644 --- a/cpp/include/cudf/lists/list_device_view.cuh +++ b/cpp/include/cudf/lists/list_device_view.cuh @@ -25,7 +25,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @brief A non-owning, immutable view of device data that represents @@ -377,4 +377,4 @@ CUDF_HOST_DEVICE auto inline make_list_size_iterator(detail::lists_column_device return detail::make_counting_transform_iterator(0, list_size_functor{c}); } -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/list_view.hpp b/cpp/include/cudf/lists/list_view.hpp index a3f36a9330f..59ad9c9bcee 100644 --- a/cpp/include/cudf/lists/list_view.hpp +++ b/cpp/include/cudf/lists/list_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,12 +16,14 @@ */ #pragma once +#include + /** * @file list_view.hpp * @brief Class definition for cudf::list_view. 
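A hedged usage sketch of cudf::explode with the stream parameter added above (function name and column contents illustrative):

#include <cudf/lists/explode.hpp>
#include <rmm/cuda_stream_view.hpp>

std::unique_ptr<cudf::table> flatten(cudf::table_view const& t,
                                     cudf::size_type list_col,
                                     rmm::cuda_stream_view stream)
{
  // e.g. list column [[5,10,15],[20]] with companion column ["a","b"]
  //   -> [5,10,15,20] with ["a","a","a","b"]
  return cudf::explode(t, list_col, stream);
}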
*/ -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @brief A non-owning, immutable view of device data that represents @@ -29,4 +31,4 @@ namespace cudf { */ class list_view {}; -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/lists_column_device_view.cuh b/cpp/include/cudf/lists/lists_column_device_view.cuh index 4d12ee1cab4..b3ec18a7913 100644 --- a/cpp/include/cudf/lists/lists_column_device_view.cuh +++ b/cpp/include/cudf/lists/lists_column_device_view.cuh @@ -21,9 +21,7 @@ #include -namespace cudf { - -namespace detail { +namespace cudf::detail { /** * @brief Given a column_device_view, an instance of this class provides a @@ -116,6 +114,4 @@ class lists_column_device_view : private column_device_view { } }; -} // namespace detail - -} // namespace cudf +} // namespace cudf::detail diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp index 3397cb0ca1d..b117a871b64 100644 --- a/cpp/include/cudf/lists/lists_column_view.hpp +++ b/cpp/include/cudf/lists/lists_column_view.hpp @@ -17,6 +17,7 @@ #include #include +#include #include @@ -25,7 +26,7 @@ * @brief Class definition for cudf::lists_column_view */ -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup lists_classes @@ -137,4 +138,4 @@ class lists_column_view : private column_view { } }; /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/reverse.hpp b/cpp/include/cudf/lists/reverse.hpp index 34c40c5a3ba..f00e6e5117a 100644 --- a/cpp/include/cudf/lists/reverse.hpp +++ b/cpp/include/cudf/lists/reverse.hpp @@ -17,13 +17,15 @@ #include #include +#include #include #include #include -namespace cudf::lists { +namespace CUDF_EXPORT cudf { +namespace lists { /** * @addtogroup lists_modify * @{ @@ -54,4 +56,5 @@ std::unique_ptr reverse( /** @} */ // end of doxygen group -} // namespace cudf::lists +} // namespace lists +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/set_operations.hpp b/cpp/include/cudf/lists/set_operations.hpp index b8abfd62461..55b1591fc44 100644 --- a/cpp/include/cudf/lists/set_operations.hpp +++ b/cpp/include/cudf/lists/set_operations.hpp @@ -23,7 +23,8 @@ #include #include -namespace cudf::lists { +namespace CUDF_EXPORT cudf { +namespace lists { /** * @addtogroup set_operations * @{ @@ -53,8 +54,8 @@ namespace cudf::lists { * @param nulls_equal Flag to specify whether null elements should be considered as equal, default * to be `UNEQUAL` which means only non-null elements are checked for overlapping * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal - * @param mr Device memory resource used to allocate the returned object * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned object * @return A column of type BOOL containing the check results */ std::unique_ptr have_overlap( @@ -177,4 +178,5 @@ std::unique_ptr difference_distinct( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf::lists +} // namespace lists +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/sorting.hpp b/cpp/include/cudf/lists/sorting.hpp index 78cea191bc5..39c71f6e9fa 100644 --- a/cpp/include/cudf/lists/sorting.hpp +++ b/cpp/include/cudf/lists/sorting.hpp @@ -18,11 +18,12 @@ #include #include #include +#include #include #include -namespace 
cudf { +namespace CUDF_EXPORT cudf { namespace lists { /** * @addtogroup lists_sort @@ -74,4 +75,4 @@ std::unique_ptr<column> stable_sort_lists( /** @} */ // end of group } // namespace lists -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp index 31f09d37560..28ef13cd870 100644 --- a/cpp/include/cudf/lists/stream_compaction.hpp +++ b/cpp/include/cudf/lists/stream_compaction.hpp @@ -17,12 +17,14 @@ #include #include +#include #include #include #include -namespace cudf::lists { +namespace CUDF_EXPORT cudf { +namespace lists { /** * @addtogroup lists_filtering @@ -94,4 +96,5 @@ std::unique_ptr<column> distinct( /** @} */ // end of group -} // namespace cudf::lists +} // namespace lists +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/merge.hpp b/cpp/include/cudf/merge.hpp index 301e56c19b8..83c6ff04500 100644 --- a/cpp/include/cudf/merge.hpp +++ b/cpp/include/cudf/merge.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -24,7 +25,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup column_merge * @{ @@ -110,4 +111,4 @@ std::unique_ptr merge( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 9e375df140b..70ca6aa29c5 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -25,7 +26,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup column_nullmask @@ -208,4 +209,4 @@ cudf::size_type null_count(bitmask_type const* bitmask, size_type stop, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index 9ed56297908..6a53553063e 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -26,7 +27,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup reorder_partition * @{ @@ -254,4 +255,4 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> round_robin_partition( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp index a1c98ee4e9d..47eac2e72f9 100644 --- a/cpp/include/cudf/quantiles.hpp +++ b/cpp/include/cudf/quantiles.hpp @@ -20,11 +20,12 @@ #include #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup column_quantiles * @{ @@ -129,4 +130,4 @@ std::unique_ptr<column> percentile_approx( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp index 52f39925a2d..e42ff5df15d 100644 --- a/cpp/include/cudf/reduction.hpp +++ b/cpp/include/cudf/reduction.hpp @@ -18,13 +18,14 @@ #include #include +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup aggregation_reduction *
@{ @@ -232,4 +233,4 @@ std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> minmax( /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp index f23c5a14e33..5b17df47ec7 100644 --- a/cpp/include/cudf/reduction/detail/histogram.hpp +++ b/cpp/include/cudf/reduction/detail/histogram.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -27,7 +28,8 @@ #include #include -namespace cudf::reduction::detail { +namespace CUDF_EXPORT cudf { +namespace reduction::detail { /** * @brief Compute the frequency for each distinct row in the input table. @@ -55,4 +57,5 @@ compute_row_frequencies(table_view const& input, */ [[nodiscard]] std::unique_ptr<column> make_empty_histogram_like(column_view const& values); -} // namespace cudf::reduction::detail +} // namespace reduction::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/reduction/detail/reduction.hpp b/cpp/include/cudf/reduction/detail/reduction.hpp index 78f90a1e2c9..a15783fb460 100644 --- a/cpp/include/cudf/reduction/detail/reduction.hpp +++ b/cpp/include/cudf/reduction/detail/reduction.hpp @@ -19,12 +19,14 @@ #include #include #include +#include #include #include -namespace cudf::reduction::detail { +namespace CUDF_EXPORT cudf { +namespace reduction::detail { /** * @copydoc cudf::reduce(column_view const&, reduce_aggregation const&, data_type, @@ -39,4 +41,5 @@ std::unique_ptr<scalar> reduce(column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace cudf::reduction::detail +} // namespace reduction::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/reduction/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp index 31d465619b9..fa21dc87e64 100644 --- a/cpp/include/cudf/reduction/detail/reduction_functions.hpp +++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp @@ -20,15 +20,15 @@ #include #include #include +#include #include #include #include -namespace cudf { -namespace reduction { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace reduction::detail { /** * @brief Computes sum of elements in input column * @@ -352,6 +352,5 @@ std::unique_ptr merge_sets(lists_column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace reduction -} // namespace cudf +} // namespace reduction::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp index 770ac6580ef..1c55b387454 100644 --- a/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp +++ b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp @@ -20,15 +20,15 @@ #include #include #include +#include #include #include #include -namespace cudf { -namespace reduction { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace reduction::detail { /** * @brief Compute sum of each segment in the input column @@ -354,6 +354,5 @@ std::unique_ptr<column> segmented_nunique(column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace reduction -} // namespace cudf +} // namespace reduction::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/replace.hpp b/cpp/include/cudf/replace.hpp index ae20e72f023..43aabd6c6c6 100644 ---
a/cpp/include/cudf/replace.hpp +++ b/cpp/include/cudf/replace.hpp @@ -18,13 +18,14 @@ #include #include +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup transformation_replace * @{ @@ -308,4 +309,4 @@ void normalize_nans_and_zeros(mutable_column_view& in_out, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp index 26316be7fd4..07aaf6488ad 100644 --- a/cpp/include/cudf/reshape.hpp +++ b/cpp/include/cudf/reshape.hpp @@ -19,13 +19,14 @@ #include #include #include +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup column_reshape * @{ @@ -46,13 +47,14 @@ namespace cudf { * @throws cudf::logic_error if input contains no columns. * @throws cudf::logic_error if input columns dtypes are not identical. * - * @param[in] input Table containing columns to interleave - * @param[in] mr Device memory resource used to allocate the returned column's device memory - * + * @param input Table containing columns to interleave + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory * @return The interleaved columns as a single column */ std::unique_ptr interleave_columns( table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -67,15 +69,17 @@ std::unique_ptr interleave_columns( * return = [[8, 4, 7, 8, 4, 7], [5, 2, 3, 5, 2, 3]] * ``` * - * @param[in] input Table containing rows to be repeated - * @param[in] count Number of times to tile "rows". Must be non-negative - * @param[in] mr Device memory resource used to allocate the returned table's device memory + * @param input Table containing rows to be repeated + * @param count Number of times to tile "rows". Must be non-negative + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory * * @return The table containing the tiled "rows" */ std::unique_ptr
tile( table_view const& input, size_type count, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -94,6 +98,7 @@ enum class flip_endianness : bool { NO, YES }; * * @param input_column Column to be converted to lists of bytes * @param endian_configuration Whether to retain or flip the endianness of the elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @return The column containing the lists of bytes @@ -101,8 +106,9 @@ enum class flip_endianness : bool { NO, YES }; std::unique_ptr byte_cast( column_view const& input_column, flip_endianness endian_configuration, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index d55322dd3e8..5a8c454d8fc 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -18,13 +18,14 @@ #include #include +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup aggregation_rolling * @{ @@ -615,4 +616,4 @@ std::unique_ptr rolling_window( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/rolling/range_window_bounds.hpp b/cpp/include/cudf/rolling/range_window_bounds.hpp index a9ee12cea27..21be609cbe6 100644 --- a/cpp/include/cudf/rolling/range_window_bounds.hpp +++ b/cpp/include/cudf/rolling/range_window_bounds.hpp @@ -17,8 +17,9 @@ #pragma once #include +#include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup aggregation_rolling * @{ @@ -119,4 +120,4 @@ struct range_window_bounds { }; /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/round.hpp b/cpp/include/cudf/round.hpp index 85935f8f05c..ef144b328f7 100644 --- a/cpp/include/cudf/round.hpp +++ b/cpp/include/cudf/round.hpp @@ -17,11 +17,12 @@ #pragma once #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup transformation_unaryops @@ -78,4 +79,4 @@ std::unique_ptr round( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index d78907b473a..2c5cc60fc70 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -32,7 +32,7 @@ * @brief Class definitions for cudf::scalar */ -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup scalar_classes * @{ @@ -894,4 +894,4 @@ class struct_scalar : public scalar { }; /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index 846da0bbe10..cbd3e9175ac 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ * @brief Scalar device view class definitions */ -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** * @brief A non-owning view of scalar from device that is trivially copyable @@ -440,4 +440,4 @@ auto get_scalar_device_view(fixed_point_scalar& s) return fixed_point_scalar_device_view(s.type(), s.data(), s.validity_data()); } -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/scalar/scalar_factories.hpp b/cpp/include/cudf/scalar/scalar_factories.hpp index 7dd4674a2fd..a422c3bfbe9 100644 --- a/cpp/include/cudf/scalar/scalar_factories.hpp +++ b/cpp/include/cudf/scalar/scalar_factories.hpp @@ -22,7 +22,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup scalar_factories * @{ @@ -227,4 +227,4 @@ std::unique_ptr make_struct_scalar( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/search.hpp b/cpp/include/cudf/search.hpp index 2e50ba2d687..ad170ec726b 100644 --- a/cpp/include/cudf/search.hpp +++ b/cpp/include/cudf/search.hpp @@ -20,13 +20,14 @@ #include #include #include +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup column_search * @{ @@ -168,4 +169,4 @@ std::unique_ptr contains( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index 79a00cbce42..4cb265a2a0b 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -26,7 +27,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup column_sort @@ -346,4 +347,4 @@ std::unique_ptr
stable_segmented_sort_by_key( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 181af11adb8..cfe404ff6ab 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -25,7 +26,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup reorder_compact * @{ @@ -401,4 +402,4 @@ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL); /** @} */ -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/attributes.hpp b/cpp/include/cudf/strings/attributes.hpp index 26f906b3102..323290e907c 100644 --- a/cpp/include/cudf/strings/attributes.hpp +++ b/cpp/include/cudf/strings/attributes.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { //! Strings column APIs namespace strings { @@ -91,4 +91,4 @@ std::unique_ptr code_points( /** @} */ // end of strings_apis group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/capitalize.hpp b/cpp/include/cudf/strings/capitalize.hpp index f8cbdc09748..420b46a05b2 100644 --- a/cpp/include/cudf/strings/capitalize.hpp +++ b/cpp/include/cudf/strings/capitalize.hpp @@ -23,7 +23,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_case @@ -129,4 +129,4 @@ std::unique_ptr is_title( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/case.hpp b/cpp/include/cudf/strings/case.hpp index 5403fa8db7e..45f56a681a6 100644 --- a/cpp/include/cudf/strings/case.hpp +++ b/cpp/include/cudf/strings/case.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_case @@ -89,4 +89,4 @@ std::unique_ptr swapcase( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/char_types/char_cases.hpp b/cpp/include/cudf/strings/char_types/char_cases.hpp index 9eb63f71a2f..e5e619b8a50 100644 --- a/cpp/include/cudf/strings/char_types/char_cases.hpp +++ b/cpp/include/cudf/strings/char_types/char_cases.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
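A note on the pattern repeated throughout these hunks: C++ permits an attribute only on a non-nested namespace-definition, which is why every header that used the C++17 shorthand (`namespace cudf::lists { ... }`) is split into an attributed outer namespace plus an ordinary inner one. The sketch below illustrates the idea; the real `CUDF_EXPORT` definition lives in the new `cudf/utilities/export.hpp` header that each file now includes, and the exact expansion shown here is an assumption, not quoted from this patch.

// Plausible shape of the macro (assumed; see cudf/utilities/export.hpp):
#if defined(__GNUC__) || defined(__clang__)
#define CUDF_EXPORT __attribute__((visibility("default")))
#else
#define CUDF_EXPORT
#endif

// The attribute may only appear on a non-nested namespace-definition, so the
// nested shorthand is split at the exported namespace:
namespace CUDF_EXPORT cudf {
namespace lists {
// Declarations here keep default visibility even when the library is built
// with -fvisibility=hidden.
}  // namespace lists
}  // namespace CUDF_EXPORT cudf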
@@ -15,7 +15,9 @@ */ #pragma once -namespace cudf { +#include + +namespace CUDF_EXPORT cudf { namespace strings { namespace detail { /** @@ -31,4 +33,4 @@ void generate_special_mapping_hash_table(); } // namespace detail } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index da7a238a400..a6af681eec6 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -23,7 +23,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_types @@ -119,4 +119,4 @@ std::unique_ptr filter_characters_of_type( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/char_types/char_types_enum.hpp b/cpp/include/cudf/strings/char_types/char_types_enum.hpp index 8d248cb2ebf..a9142fdbda6 100644 --- a/cpp/include/cudf/strings/char_types/char_types_enum.hpp +++ b/cpp/include/cudf/strings/char_types/char_types_enum.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_types @@ -80,4 +80,4 @@ constexpr string_character_types& operator|=(string_character_types& lhs, /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 8cc735831b8..2cade813d78 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -24,7 +24,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_combine @@ -334,4 +334,4 @@ std::unique_ptr join_list_elements( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index f79a0f19e9c..59c9b2dea40 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -23,7 +23,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { struct regex_program; @@ -209,4 +209,4 @@ std::unique_ptr like( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/convert/convert_booleans.hpp b/cpp/include/cudf/strings/convert/convert_booleans.hpp index 9c922361914..d79dd4a80ea 100644 --- a/cpp/include/cudf/strings/convert/convert_booleans.hpp +++ b/cpp/include/cudf/strings/convert/convert_booleans.hpp @@ -22,7 +22,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_convert @@ -72,4 +72,4 @@ std::unique_ptr from_booleans( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/convert/convert_datetime.hpp b/cpp/include/cudf/strings/convert/convert_datetime.hpp index b89384d718b..c3b3c91ab35 100644 --- 
a/cpp/include/cudf/strings/convert/convert_datetime.hpp +++ b/cpp/include/cudf/strings/convert/convert_datetime.hpp @@ -24,7 +24,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_convert @@ -255,4 +255,4 @@ std::unique_ptr from_timestamps( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/convert/convert_durations.hpp b/cpp/include/cudf/strings/convert/convert_durations.hpp index 2db719a4f1f..8b69968a609 100644 --- a/cpp/include/cudf/strings/convert/convert_durations.hpp +++ b/cpp/include/cudf/strings/convert/convert_durations.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_convert @@ -133,4 +133,4 @@ std::unique_ptr from_durations( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index 9911bea1948..a9c5aea6343 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_convert @@ -130,4 +130,4 @@ std::unique_ptr is_fixed_point( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp index feb5b528686..64e9bb776f4 100644 --- a/cpp/include/cudf/strings/convert/convert_floats.hpp +++ b/cpp/include/cudf/strings/convert/convert_floats.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_convert @@ -103,4 +103,4 @@ std::unique_ptr is_float( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 82696811fdc..62eb1fdda4d 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_convert @@ -235,4 +235,4 @@ std::unique_ptr integers_to_hex( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/convert/convert_ipv4.hpp b/cpp/include/cudf/strings/convert/convert_ipv4.hpp index 64f8a412ce9..04a04907c12 100644 --- a/cpp/include/cudf/strings/convert/convert_ipv4.hpp +++ b/cpp/include/cudf/strings/convert/convert_ipv4.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_convert @@ -113,4 +113,4 @@ std::unique_ptr is_ipv4( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp index a88bbe99492..85b67907228 100644 --- a/cpp/include/cudf/strings/convert/convert_lists.hpp +++ 
b/cpp/include/cudf/strings/convert/convert_lists.hpp @@ -23,7 +23,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_convert @@ -68,4 +68,4 @@ std::unique_ptr format_list_column( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp index 30988d2ff0a..a42a5cd2407 100644 --- a/cpp/include/cudf/strings/convert/convert_urls.hpp +++ b/cpp/include/cudf/strings/convert/convert_urls.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_convert @@ -75,4 +75,4 @@ std::unique_ptr url_decode( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/detail/char_tables.hpp b/cpp/include/cudf/strings/detail/char_tables.hpp index 0901076c835..5d6aff28826 100644 --- a/cpp/include/cudf/strings/detail/char_tables.hpp +++ b/cpp/include/cudf/strings/detail/char_tables.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,11 +15,12 @@ */ #pragma once +#include + #include -namespace cudf { -namespace strings { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace strings::detail { // Type for the character flags table. using character_flags_table_type = std::uint8_t; @@ -101,6 +102,5 @@ constexpr uint16_t get_special_case_hash_index(uint32_t code_point) return static_cast(code_point % special_case_prime); } -} // namespace detail -} // namespace strings -} // namespace cudf +} // namespace strings::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index 25214055787..962191eae6a 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -21,13 +21,13 @@ #include #include #include +#include #include #include -namespace cudf { -namespace strings { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace strings::detail { /** * @copydoc concatenate(table_view const&,string_scalar const&,string_scalar @@ -68,6 +68,5 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace strings -} // namespace cudf +} // namespace strings::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp index b5dd5b9516a..e038102ab1f 100644 --- a/cpp/include/cudf/strings/detail/concatenate.hpp +++ b/cpp/include/cudf/strings/detail/concatenate.hpp @@ -19,14 +19,14 @@ #include #include #include +#include #include #include #include -namespace cudf { -namespace strings { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace strings::detail { /** * @brief Returns a single column by vertically concatenating the given vector of * strings columns. 
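For orientation, the `strings::detail` overloads above are internal: they take an explicit stream and memory resource with no defaults. User code reaches the same functionality through the public API, which defaults both. A hedged usage sketch (the public `cudf::concatenate` from `cudf/concatenate.hpp` is assumed here; that header is not part of this diff):

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/concatenate.hpp>

#include <memory>
#include <vector>

// Vertically concatenate two columns; stream and mr use the public defaults.
std::unique_ptr<cudf::column> concat_two(cudf::column_view const& a,
                                         cudf::column_view const& b)
{
  std::vector<cudf::column_view> views{a, b};
  return cudf::concatenate(views);
}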
@@ -47,6 +47,5 @@ std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace strings -} // namespace cudf +} // namespace strings::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/detail/converters.hpp b/cpp/include/cudf/strings/detail/converters.hpp index d212239264b..73a97499293 100644 --- a/cpp/include/cudf/strings/detail/converters.hpp +++ b/cpp/include/cudf/strings/detail/converters.hpp @@ -18,13 +18,13 @@ #include #include #include +#include #include #include -namespace cudf { -namespace strings { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace strings::detail { /** * @copydoc to_integers(strings_column_view const&,data_type,rmm::device_async_resource_ref) @@ -153,6 +153,5 @@ std::unique_ptr from_fixed_point(column_view const& integers, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace strings -} // namespace cudf +} // namespace strings::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/detail/copy_range.hpp b/cpp/include/cudf/strings/detail/copy_range.hpp index 192c5b833c6..71dcf9edaf3 100644 --- a/cpp/include/cudf/strings/detail/copy_range.hpp +++ b/cpp/include/cudf/strings/detail/copy_range.hpp @@ -21,9 +21,8 @@ #include #include -namespace cudf { -namespace strings { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace strings::detail { /** * @brief Internal API to copy a range of string elements out-of-place from @@ -56,6 +55,5 @@ std::unique_ptr copy_range(strings_column_view const& source, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace strings -} // namespace cudf +} // namespace strings::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/detail/copying.hpp b/cpp/include/cudf/strings/detail/copying.hpp index 240cac17188..b4d3362359d 100644 --- a/cpp/include/cudf/strings/detail/copying.hpp +++ b/cpp/include/cudf/strings/detail/copying.hpp @@ -19,13 +19,13 @@ #include #include #include +#include #include #include -namespace cudf { -namespace strings { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace strings::detail { /** * @brief Returns a new strings column created from a subset * of the strings column. @@ -83,6 +83,5 @@ std::unique_ptr shift(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace strings -} // namespace cudf +} // namespace strings::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/detail/fill.hpp b/cpp/include/cudf/strings/detail/fill.hpp index c5d005fbf75..1a3ff2c9166 100644 --- a/cpp/include/cudf/strings/detail/fill.hpp +++ b/cpp/include/cudf/strings/detail/fill.hpp @@ -19,13 +19,13 @@ #include #include #include +#include #include #include -namespace cudf { -namespace strings { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace strings::detail { /** * @brief Returns a strings column replacing a range of rows * with the specified string.
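The `fill` declared in this header backs the public `cudf::fill` when the input is a strings column. A hedged sketch of that public entry point (assumed from `cudf/filling.hpp`, which this patch does not touch): replace rows [2, 5) with one scalar value.

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/filling.hpp>
#include <cudf/scalar/scalar.hpp>

#include <memory>

std::unique_ptr<cudf::column> blank_out(cudf::column_view const& input)
{
  cudf::string_scalar repl{"n/a"};       // replacement value for the range
  return cudf::fill(input, 2, 5, repl);  // end index is exclusive
}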
@@ -50,6 +50,5 @@ std::unique_ptr fill(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace strings -} // namespace cudf +} // namespace strings::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index fcd74bebfe8..4369de317b3 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -18,11 +18,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -230,7 +232,8 @@ rmm::device_uvector gather_chars(StringIterator strings_begin, if (output_count == 0) return rmm::device_uvector(0, stream, mr); auto chars_data = rmm::device_uvector(chars_bytes, stream, mr); - auto d_chars = chars_data.data(); + cudf::experimental::prefetch::detail::prefetch("gather", chars_data, stream); + auto d_chars = chars_data.data(); constexpr int warps_per_threadblock = 4; // String parallel strategy will be used if average string length is above this threshold. @@ -312,6 +315,8 @@ std::unique_ptr gather(strings_column_view const& strings, // build chars column auto const offsets_view = cudf::detail::offsetalator_factory::make_input_iterator(out_offsets_column->view()); + cudf::experimental::prefetch::detail::prefetch( + "gather", strings.chars_begin(stream), strings.chars_size(stream), stream); auto out_chars_data = gather_chars( d_strings->begin(), begin, end, offsets_view, total_bytes, stream, mr); diff --git a/cpp/include/cudf/strings/detail/merge.hpp b/cpp/include/cudf/strings/detail/merge.hpp index 35fd9c0593d..0aa5c0c2899 100644 --- a/cpp/include/cudf/strings/detail/merge.hpp +++ b/cpp/include/cudf/strings/detail/merge.hpp @@ -18,10 +18,12 @@ #include #include #include +#include #include -namespace cudf ::strings ::detail { +namespace CUDF_EXPORT cudf { +namespace strings::detail { /** * @brief Merges two strings columns * @@ -38,4 +40,5 @@ std::unique_ptr merge(strings_column_view const& lhs, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace cudf::strings::detail +} // namespace strings::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp index 481d00f1bce..ab092555c48 100644 --- a/cpp/include/cudf/strings/detail/replace.hpp +++ b/cpp/include/cudf/strings/detail/replace.hpp @@ -19,13 +19,13 @@ #include #include #include +#include #include #include -namespace cudf { -namespace strings { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace strings::detail { /** * @copydoc cudf::strings::replace(strings_column_view const&, string_scalar const&, @@ -100,6 +100,5 @@ std::unique_ptr find_and_replace_all( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace strings -} // namespace cudf +} // namespace strings::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/detail/scan.hpp b/cpp/include/cudf/strings/detail/scan.hpp index f32afa64a72..4991fd633d5 100644 --- a/cpp/include/cudf/strings/detail/scan.hpp +++ b/cpp/include/cudf/strings/detail/scan.hpp @@ -21,9 +21,8 @@ #include #include -namespace cudf { -namespace strings { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace strings::detail { /** * @brief Scan function for strings * @@ -43,6 +42,5 @@ std::unique_ptr scan_inclusive(column_view const& input, 
rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace strings -} // namespace cudf +} // namespace strings::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index f5f3982a5d6..55b59dd4ff2 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -186,6 +187,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, // Now build the chars column rmm::device_uvector chars(bytes, stream, mr); + cudf::experimental::prefetch::detail::prefetch("gather", chars, stream); size_and_exec_fn.d_chars = chars.data(); // Execute the function fn again to fill in the chars data. diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp index 5587597cb51..85349a421b1 100644 --- a/cpp/include/cudf/strings/detail/utf8.hpp +++ b/cpp/include/cudf/strings/detail/utf8.hpp @@ -22,9 +22,8 @@ * @brief Standalone string functions. */ -namespace cudf { -namespace strings { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace strings::detail { /** * @brief This will return true if passed a continuation byte of a UTF-8 character. @@ -206,6 +205,5 @@ constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) return utf8; } -} // namespace detail -} // namespace strings -} // namespace cudf +} // namespace strings::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index 4467a9d0023..1fa505501d8 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -18,15 +18,15 @@ #include #include #include +#include #include #include #include #include -namespace cudf { -namespace strings { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace strings::detail { /** * @brief Create an offsets column to be a child of a strings column @@ -96,6 +96,5 @@ int64_t get_offset_value(cudf::column_view const& offsets, size_type index, rmm::cuda_stream_view stream); -} // namespace detail -} // namespace strings -} // namespace cudf +} // namespace strings::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index 4138e1e59d5..2ef7308b802 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -22,7 +22,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { struct regex_program; @@ -104,4 +104,4 @@ std::unique_ptr extract_all_record( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp index c116dbc2fe1..efba6da9454 100644 --- a/cpp/include/cudf/strings/find.hpp +++ b/cpp/include/cudf/strings/find.hpp @@ -22,7 +22,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_find @@ -262,4 +262,4 @@ std::unique_ptr ends_with( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/find_multiple.hpp 
b/cpp/include/cudf/strings/find_multiple.hpp index c2e82aa6f1a..dea08308ff0 100644 --- a/cpp/include/cudf/strings/find_multiple.hpp +++ b/cpp/include/cudf/strings/find_multiple.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_find @@ -63,4 +63,4 @@ std::unique_ptr find_multiple( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index abc1d28ee4c..26249b6842c 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -22,7 +22,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { struct regex_program; @@ -70,4 +70,4 @@ std::unique_ptr findall( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/padding.hpp b/cpp/include/cudf/strings/padding.hpp index f1382d6ea29..11e35f717ae 100644 --- a/cpp/include/cudf/strings/padding.hpp +++ b/cpp/include/cudf/strings/padding.hpp @@ -23,7 +23,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_modify @@ -96,4 +96,4 @@ std::unique_ptr zfill( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index 44ca68439e7..f7108129dee 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
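Several headers in this run (`contains.hpp`, `extract.hpp`, `findall.hpp`) operate on a precompiled `regex_program`, and the flags declared below feed into its creation. A usage sketch, assuming the public API shapes (`regex_program::create` and `contains_re`) rather than quoting this diff:

#include <cudf/column/column.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/regex/regex_program.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// Compile the pattern once, then reuse it across calls.
std::unique_ptr<cudf::column> ends_in_ing(cudf::strings_column_view const& input)
{
  auto prog = cudf::strings::regex_program::create("\\w+ing");
  return cudf::strings::contains_re(input, *prog);  // BOOL8 column of matches
}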
@@ -15,9 +15,11 @@ */ #pragma once +#include + #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** @@ -86,4 +88,4 @@ enum class capture_groups : uint32_t { /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp index 95c86ae0f8a..9da859d9c87 100644 --- a/cpp/include/cudf/strings/regex/regex_program.hpp +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** @@ -135,4 +135,4 @@ struct regex_program { /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp index cbf1edc8331..e160f75390b 100644 --- a/cpp/include/cudf/strings/repeat_strings.hpp +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_copy @@ -133,4 +133,4 @@ std::unique_ptr repeat_strings( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index a714f762a19..5b4ffb98f99 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -22,7 +22,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_replace @@ -174,4 +174,4 @@ std::unique_ptr replace_multiple( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index f61f9585144..6b487072cb2 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -25,7 +25,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { struct regex_program; @@ -112,4 +112,4 @@ std::unique_ptr replace_with_backrefs( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/reverse.hpp b/cpp/include/cudf/strings/reverse.hpp index 86656693c8b..fbda2e5fe7c 100644 --- a/cpp/include/cudf/strings/reverse.hpp +++ b/cpp/include/cudf/strings/reverse.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_modify @@ -53,4 +53,4 @@ std::unique_ptr reverse( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/side_type.hpp b/cpp/include/cudf/strings/side_type.hpp index 5905e087deb..5b794261ad9 100644 --- a/cpp/include/cudf/strings/side_type.hpp +++ b/cpp/include/cudf/strings/side_type.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
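`side_type`, declared just below, selects which end of each string APIs such as `strip` act on. A minimal sketch, assuming the public `cudf::strings::strip` signature:

#include <cudf/column/column.hpp>
#include <cudf/strings/side_type.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/strip.hpp>

#include <memory>

// Trim whitespace from the left side of every string only.
std::unique_ptr<cudf::column> trim_left(cudf::strings_column_view const& input)
{
  return cudf::strings::strip(input, cudf::strings::side_type::LEFT);
}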
@@ -15,7 +15,9 @@ */ #pragma once -namespace cudf { +#include + +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_modify @@ -34,4 +36,4 @@ enum class side_type { /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/slice.hpp b/cpp/include/cudf/strings/slice.hpp index e2be6abd344..b0da6976207 100644 --- a/cpp/include/cudf/strings/slice.hpp +++ b/cpp/include/cudf/strings/slice.hpp @@ -22,7 +22,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_slice @@ -114,4 +114,4 @@ std::unique_ptr slice_strings( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/split/partition.hpp b/cpp/include/cudf/strings/split/partition.hpp index 0a837034ba1..8f5ae752417 100644 --- a/cpp/include/cudf/strings/split/partition.hpp +++ b/cpp/include/cudf/strings/split/partition.hpp @@ -22,7 +22,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_split @@ -101,4 +101,4 @@ std::unique_ptr
rpartition( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp index d5c44406ca7..ca371d7abd1 100644 --- a/cpp/include/cudf/strings/split/split.hpp +++ b/cpp/include/cudf/strings/split/split.hpp @@ -22,7 +22,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_split @@ -245,4 +245,4 @@ std::unique_ptr rsplit_record( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index 81595fa7ed4..96ef0b6e830 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -22,7 +22,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { struct regex_program; @@ -263,4 +263,4 @@ std::unique_ptr rsplit_record_re( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 93cc787683b..abb26d7ccb4 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -18,6 +18,7 @@ #include #include +#include #ifndef __CUDA_ARCH__ #include @@ -35,7 +36,7 @@ // This file should only include device code logic. // Host-only or host/device code should be defined in the string_view.hpp header file. -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { namespace detail { @@ -448,4 +449,4 @@ __device__ inline size_type string_view::character_offset(size_type bytepos) con return strings::detail::characters_in_string(data(), bytepos); } -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index afc7e027a4b..504c31057ae 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ * @brief Class definition for cudf::string_view. 
*/ -namespace cudf { +namespace CUDF_EXPORT cudf { using char_utf8 = uint32_t; ///< UTF-8 characters are 1-4 bytes @@ -406,4 +406,4 @@ class string_view { size_type count) const; }; -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 1e9e73cef4c..4a2512eb7c5 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -17,13 +17,14 @@ #include #include +#include /** * @file * @brief Class definition for cudf::strings_column_view */ -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup strings_classes @@ -126,4 +127,4 @@ namespace strings { } // namespace strings /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp index 6fb9bbc45e6..4cfba59c72c 100644 --- a/cpp/include/cudf/strings/strip.hpp +++ b/cpp/include/cudf/strings/strip.hpp @@ -23,7 +23,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_modify @@ -71,4 +71,4 @@ std::unique_ptr strip( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp index 9cd6b7d5974..531753f4a8c 100644 --- a/cpp/include/cudf/strings/translate.hpp +++ b/cpp/include/cudf/strings/translate.hpp @@ -25,7 +25,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_modify @@ -109,4 +109,4 @@ std::unique_ptr filter_characters( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/wrap.hpp b/cpp/include/cudf/strings/wrap.hpp index c05c33fbac8..465a9d15d00 100644 --- a/cpp/include/cudf/strings/wrap.hpp +++ b/cpp/include/cudf/strings/wrap.hpp @@ -21,7 +21,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace strings { /** * @addtogroup strings_modify @@ -72,4 +72,4 @@ std::unique_ptr wrap( /** @} */ // end of doxygen group } // namespace strings -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/structs/detail/concatenate.hpp b/cpp/include/cudf/structs/detail/concatenate.hpp index 5dc3169c0c4..16be868af52 100644 --- a/cpp/include/cudf/structs/detail/concatenate.hpp +++ b/cpp/include/cudf/structs/detail/concatenate.hpp @@ -18,13 +18,13 @@ #include #include #include +#include #include #include -namespace cudf { -namespace structs { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace structs::detail { /** * @brief Returns a single column by concatenating the given vector of structs columns. 
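The `strings_column_view` hunk above pairs with the prefetch changes in `gather.cuh` earlier in this patch: `chars_begin` and `chars_size` are stream-ordered accessors, which is why the prefetch call receives a stream. A small sketch of those accessors in use (signatures assumed to match the header, which this diff only re-namespaces):

#include <cudf/column/column_view.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

// Report the total size of a strings column's character data.
int64_t chars_bytes(cudf::column_view const& col)
{
  auto stream = cudf::get_default_stream();
  cudf::strings_column_view sv{col};
  return sv.chars_size(stream);  // total bytes in the chars buffer
}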
@@ -54,6 +54,5 @@ std::unique_ptr concatenate(host_span columns, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace structs -} // namespace cudf +} // namespace structs::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/structs/detail/scan.hpp b/cpp/include/cudf/structs/detail/scan.hpp index c97a8452ecd..6121f63d42f 100644 --- a/cpp/include/cudf/structs/detail/scan.hpp +++ b/cpp/include/cudf/structs/detail/scan.hpp @@ -17,13 +17,13 @@ #include #include +#include #include #include -namespace cudf { -namespace structs { -namespace detail { +namespace CUDF_EXPORT cudf { +namespace structs::detail { /** * @brief Scan function for struct column type * @@ -41,6 +41,5 @@ std::unique_ptr scan_inclusive(column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -} // namespace detail -} // namespace structs -} // namespace cudf +} // namespace structs::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/structs/struct_view.hpp b/cpp/include/cudf/structs/struct_view.hpp index 75483709867..65fd3f78d1a 100644 --- a/cpp/include/cudf/structs/struct_view.hpp +++ b/cpp/include/cudf/structs/struct_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,7 +20,7 @@ * @brief Class definition for cudf::struct_view. */ -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @brief A non-owning, immutable view of device data that represents @@ -29,4 +29,4 @@ namespace cudf { */ class struct_view {}; -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/structs/structs_column_device_view.cuh b/cpp/include/cudf/structs/structs_column_device_view.cuh index 7580582631f..cf71ba87a20 100644 --- a/cpp/include/cudf/structs/structs_column_device_view.cuh +++ b/cpp/include/cudf/structs/structs_column_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { @@ -84,4 +84,4 @@ class structs_column_device_view : private column_device_view { } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/structs/structs_column_view.hpp b/cpp/include/cudf/structs/structs_column_view.hpp index 4a50488ef00..19798f51656 100644 --- a/cpp/include/cudf/structs/structs_column_view.hpp +++ b/cpp/include/cudf/structs/structs_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ * @brief Class definition for cudf::structs_column_view. 
*/ -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup structs_classes @@ -98,4 +98,4 @@ class structs_column_view : public column_view { int index, rmm::cuda_stream_view stream = cudf::get_default_stream()) const; }; // class structs_column_view; /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index c181ac7d402..f05e5f4ca5c 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -54,7 +54,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace experimental { @@ -2026,4 +2026,4 @@ class row_hasher { } // namespace row } // namespace experimental -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index 0e57d24f4b3..e3b65d77b4a 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -30,7 +30,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @brief Result type of the `element_relational_comparator` function object. @@ -635,4 +635,4 @@ class row_hasher { uint32_t _seed{DEFAULT_HASH_SEED}; }; -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index c4f14af53fb..be2af7ac653 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -31,7 +31,7 @@ * @brief Class definition for cudf::table */ -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @brief A set of cudf::column's of the same size. @@ -194,4 +194,4 @@ class table { size_type _num_rows{}; }; -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 511013b585d..16d532ea2b8 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ * @brief Table device view class definitions */ -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { /** @@ -271,4 +271,4 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st return std::make_tuple(std::move(descendant_storage), d_columns); } -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.hpp b/cpp/include/cudf/tdigest/tdigest_column_view.hpp index b2eb341df86..2f19efa5630 100644 --- a/cpp/include/cudf/tdigest/tdigest_column_view.hpp +++ b/cpp/include/cudf/tdigest/tdigest_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { //! 
Tdigest interfaces namespace tdigest { /** @@ -132,4 +132,4 @@ class tdigest_column_view : private column_view { /** @} */ // end of group } // namespace tdigest -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index 7f65128526e..8329c64e24f 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -15,6 +15,8 @@ */ #pragma once +#include + #include #include @@ -22,7 +24,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { class table; // Cycle in which the time offsets repeat in Gregorian calendar @@ -52,4 +54,4 @@ std::unique_ptr
make_timezone_transition_table( std::string_view timezone_name, rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index 7bb9fb7a42e..adc5bdb2af8 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -18,13 +18,14 @@ #include #include +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup transformation_transform * @{ @@ -248,4 +249,4 @@ std::unique_ptr segmented_row_bit_count( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/transpose.hpp b/cpp/include/cudf/transpose.hpp index c01a04afe87..f4433c46a06 100644 --- a/cpp/include/cudf/transpose.hpp +++ b/cpp/include/cudf/transpose.hpp @@ -17,11 +17,12 @@ #include #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup reshape_transpose * @{ @@ -48,4 +49,4 @@ std::pair, table_view> transpose( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index baf07fa3db6..409b8c825bb 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -36,6 +36,8 @@ #define CUDF_KERNEL static #endif +#include + #include #include #include @@ -54,7 +56,7 @@ class device_buffer; } // namespace rmm -namespace cudf { +namespace CUDF_EXPORT cudf { // Forward declaration class column; class column_view; @@ -344,4 +346,4 @@ inline bool operator!=(data_type const& lhs, data_type const& rhs) { return !(lh std::size_t size_of(data_type t); /** @} */ -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 1609c72f175..55f4c1f5a23 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -27,7 +28,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup transformation_unaryops * @{ @@ -259,4 +260,4 @@ std::unique_ptr is_not_nan( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/bit.hpp b/cpp/include/cudf/utilities/bit.hpp index 9bdc372419f..736796e610a 100644 --- a/cpp/include/cudf/utilities/bit.hpp +++ b/cpp/include/cudf/utilities/bit.hpp @@ -27,7 +27,7 @@ * @brief Utilities for bit and bitmask operations. */ -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { // @cond // Work around a bug in NVRTC that fails to compile assert() in constexpr @@ -217,4 +217,4 @@ __device__ inline void clear_bit(bitmask_type* bitmask, size_type bit_index) } #endif /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/default_stream.hpp b/cpp/include/cudf/utilities/default_stream.hpp index aacab996e8a..97a42243250 100644 --- a/cpp/include/cudf/utilities/default_stream.hpp +++ b/cpp/include/cudf/utilities/default_stream.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,11 +17,12 @@ #pragma once #include +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup default_stream * @{ @@ -43,4 +44,4 @@ rmm::cuda_stream_view const get_default_stream(); bool is_ptds_enabled(); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp index f019f516b84..f847ce0f66a 100644 --- a/cpp/include/cudf/utilities/error.hpp +++ b/cpp/include/cudf/utilities/error.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -25,7 +26,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup utility_error * @{ @@ -140,7 +141,7 @@ struct data_type_error : public std::invalid_argument, public stacktrace_recorde }; /** @} */ -} // namespace cudf +} // namespace CUDF_EXPORT cudf #define STRINGIFY_DETAIL(x) #x ///< Stringify a macro argument #define CUDF_STRINGIFY(x) STRINGIFY_DETAIL(x) ///< Stringify a macro argument @@ -229,7 +230,7 @@ struct data_type_error : public std::invalid_argument, public stacktrace_recorde /// @endcond -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { // @cond inline void throw_cuda_error(cudaError_t error, char const* file, unsigned int line) @@ -251,7 +252,7 @@ inline void throw_cuda_error(cudaError_t error, char const* file, unsigned int l } // @endcond } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf /** * @brief Error checking macro for CUDA runtime API functions. diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index 3e2fa43cb50..623a033698f 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -16,11 +16,13 @@ #pragma once +#include + #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @brief Set the rmm resource to be used for pinned memory allocations. @@ -71,4 +73,20 @@ void set_kernel_pinned_copy_threshold(size_t threshold); */ size_t get_kernel_pinned_copy_threshold(); -} // namespace cudf +/** + * @brief Set the threshold size for allocating host memory as pinned memory. + * + * @param threshold The threshold size in bytes. If the size of the allocation is less or equal to + * this threshold, the memory will be allocated as pinned memory. If the size is greater than this + * threshold, the memory will be allocated as pageable memory. + */ +void set_allocate_host_as_pinned_threshold(size_t threshold); + +/** + * @brief Get the threshold size for allocating host memory as pinned memory. + * + * @return The threshold size in bytes. + */ +size_t get_allocate_host_as_pinned_threshold(); + +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/prefetch.hpp b/cpp/include/cudf/utilities/prefetch.hpp new file mode 100644 index 00000000000..3384181fc37 --- /dev/null +++ b/cpp/include/cudf/utilities/prefetch.hpp @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace experimental::prefetch { + +namespace detail { + +/** + * @brief A singleton class that manages the prefetching configuration. + */ +class prefetch_config { + public: + prefetch_config& operator=(const prefetch_config&) = delete; + prefetch_config(const prefetch_config&) = delete; + + /** + * @brief Get the singleton instance of the prefetching configuration. + * + * @return The singleton instance of the prefetching configuration. + */ + static prefetch_config& instance(); + + /** + * @brief Get the value of a configuration key. + * + * If the key does not exist, a `false` value will be returned. + * + * @param key The configuration key. + * @return The value of the configuration key. + */ + bool get(std::string_view key); + /** + * @brief Set the value of a configuration key. + * + * This is a thread-safe operation. + * + * @param key The configuration key. + * @param value The value to set. + */ + void set(std::string_view key, bool value); + /** + * @brief Enable or disable debug mode. + * + * In debug mode, the pointers being prefetched are printed to stderr. + */ + bool debug{false}; + + private: + prefetch_config() = default; ///< Private constructor to enforce singleton pattern + std::map config_values; ///< Map of configuration keys to values + std::shared_mutex config_mtx; ///< Mutex for thread-safe config access +}; + +/** + * @brief Enable prefetching for a particular structure or algorithm. + * + * @param key The key to enable prefetching for. + * @param ptr The pointer to prefetch. + * @param size The size of the memory region to prefetch. + * @param stream The stream to prefetch on. + * @param device_id The device to prefetch on. + */ +void prefetch(std::string_view key, + void const* ptr, + std::size_t size, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id = rmm::get_current_cuda_device()); + +/** + * @brief Enable prefetching for a particular structure or algorithm. + * + * @note This function will not throw exceptions, so it is safe to call in + * noexcept contexts. If an error occurs, the error code is returned. This + * function primarily exists for [mutable_]column_view::get_data and should be + * removed once a method for stream-ordered data pointer access is added to + * those data structures. + * + * @param key The key to enable prefetching for. + * @param ptr The pointer to prefetch. + * @param size The size of the memory region to prefetch. + * @param stream The stream to prefetch on. + * @param device_id The device to prefetch on. + */ +cudaError_t prefetch_noexcept( + std::string_view key, + void const* ptr, + std::size_t size, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id = rmm::get_current_cuda_device()) noexcept; + +/** + * @brief Prefetch the data in a device_uvector. + * + * @note At present this function does not support stream-ordered execution. Prefetching always + * occurs on the default stream. + * + * @param key The key to enable prefetching for.
+ * @param v The device_uvector to prefetch. + * @param stream The stream to prefetch on. + * @param device_id The device to prefetch on. + */ +template +void prefetch(std::string_view key, + rmm::device_uvector const& v, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id = rmm::get_current_cuda_device()) +{ + if (v.is_empty()) { return; } + prefetch(key, v.data(), v.size(), stream, device_id); +} + +} // namespace detail + +/** + * @brief Enable prefetching for a particular structure or algorithm. + * + * @param key The key to enable prefetching for. + */ +void enable_prefetching(std::string_view key); + +/** + * @brief Disable prefetching for a particular structure or algorithm. + * + * @param key The key to disable prefetching for. + */ +void disable_prefetching(std::string_view key); + +/** + * @brief Enable or disable debug mode. + * + * In debug mode, the pointers being prefetched are printed to stderr. + * + * @param enable Whether to enable or disable debug mode. + */ +void prefetch_debugging(bool enable); + +} // namespace experimental::prefetch +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 3b35e60e034..0daebc0dd8d 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -16,6 +16,9 @@ #pragma once +#include +#include + #include #include #include @@ -30,7 +33,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup utility_span * @{ @@ -257,6 +260,26 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + constexpr host_span(cudf::detail::host_vector& in) + : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} + { + } + + /// Constructor from a const host_vector + /// @param in The host_vector to construct the span from + template >* = nullptr> + constexpr host_span(cudf::detail::host_vector const& in) + : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} + { + } + // Copy construction to support const conversion /// @param other The span to copy template using device_2dspan = base_2dspan; } // namespace detail -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/thread_pool.hpp b/cpp/include/cudf/utilities/thread_pool.hpp deleted file mode 100644 index c8c3eb097c4..00000000000 --- a/cpp/include/cudf/utilities/thread_pool.hpp +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -/** - * Modified from https://github.com/bshoshany/thread-pool - * @copyright Copyright (c) 2021 Barak Shoshany. Licensed under the MIT license. 
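The pinned-memory thresholds and the `cudf::experimental::prefetch` interface declared above are all plain free functions, so wiring them up takes only a few calls. A brief usage sketch (the 256 KiB threshold and the "gather" key are illustrative choices, not values taken from this patch):

    // Hedged usage sketch for the APIs declared above; the key name and the
    // threshold value are examples, not defaults from the patch.
    #include <cudf/utilities/pinned_memory.hpp>
    #include <cudf/utilities/prefetch.hpp>

    void configure_host_memory_and_prefetching()
    {
      // Host allocations no larger than 256 KiB are served as pinned memory;
      // anything bigger falls back to pageable memory.
      cudf::set_allocate_host_as_pinned_threshold(256 * 1024);

      // Opt one algorithm key into prefetching, and print the prefetched
      // pointers to stderr while debugging.
      cudf::experimental::prefetch::enable_prefetching("gather");
      cudf::experimental::prefetch::prefetch_debugging(true);
    }

Internally, the `detail::prefetch*` overloads consult `prefetch_config::instance()` for the key before issuing any prefetch, which is why `set` is documented as thread-safe.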
- * See file LICENSE for detail or copy at https://opensource.org/licenses/MIT - */ - -#include // std::atomic -#include // std::chrono -#include // std::int_fast64_t, std::uint_fast32_t -#include // std::function -#include // std::future, std::promise -#include // std::shared_ptr, std::unique_ptr -#include // std::mutex, std::scoped_lock -#include // std::queue -#include // std::this_thread, std::thread -#include // std::decay_t, std::enable_if_t, std::is_void_v, std::invoke_result_t -#include // std::move, std::swap - -namespace cudf { -namespace detail { - -/** - * @brief A C++17 thread pool class. The user submits tasks to be executed into a queue. Whenever a - * thread becomes available, it pops a task from the queue and executes it. Each task is - * automatically assigned a future, which can be used to wait for the task to finish executing - * and/or obtain its eventual return value. - */ -class thread_pool { - using ui32 = int; - - public: - /** - * @brief Construct a new thread pool. - * - * @param _thread_count The number of threads to use. The default value is the total number of - * hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this - * will be twice the number of CPU cores. If the argument is zero, the default value will be used - * instead. - */ - thread_pool(ui32 const& _thread_count = std::thread::hardware_concurrency()) - : thread_count(_thread_count ? _thread_count : std::thread::hardware_concurrency()), - threads(new std::thread[_thread_count ? _thread_count : std::thread::hardware_concurrency()]) - { - create_threads(); - } - - /** - * @brief Destruct the thread pool. Waits for all tasks to complete, then destroys all threads. - * Note that if the variable paused is set to true, then any tasks still in the queue will never - * be executed. - */ - ~thread_pool() - { - wait_for_tasks(); - running = false; - destroy_threads(); - } - - /** - * @brief Get the number of tasks currently waiting in the queue to be executed by the threads. - * - * @return The number of queued tasks. - */ - [[nodiscard]] size_t get_tasks_queued() const - { - std::scoped_lock const lock(queue_mutex); - return tasks.size(); - } - - /** - * @brief Get the number of tasks currently being executed by the threads. - * - * @return The number of running tasks. - */ - [[nodiscard]] ui32 get_tasks_running() const { return tasks_total - (ui32)get_tasks_queued(); } - - /** - * @brief Get the total number of unfinished tasks - either still in the queue, or running in a - * thread. - * - * @return The total number of tasks. - */ - [[nodiscard]] ui32 get_tasks_total() const { return tasks_total; } - - /** - * @brief Get the number of threads in the pool. - * - * @return The number of threads. - */ - [[nodiscard]] ui32 get_thread_count() const { return thread_count; } - - /** - * @brief Parallelize a loop by splitting it into blocks, submitting each block separately to the - * thread pool, and waiting for all blocks to finish executing. The loop will be equivalent to: - * for (T i = first_index; i <= last_index; i++) loop(i); - * - * @tparam T The type of the loop index. Should be a signed or unsigned integer. - * @tparam F The type of the function to loop through. - * @param first_index The first index in the loop (inclusive). - * @param last_index The last index in the loop (inclusive). - * @param loop The function to loop through. Should take exactly one argument, the loop index. - * @param num_tasks The maximum number of tasks to split the loop into. 
The default is to use the - * number of threads in the pool. - */ - template - void parallelize_loop(T first_index, T last_index, F const& loop, ui32 num_tasks = 0) - { - if (num_tasks == 0) num_tasks = thread_count; - if (last_index < first_index) std::swap(last_index, first_index); - size_t total_size = last_index - first_index + 1; - size_t block_size = total_size / num_tasks; - if (block_size == 0) { - block_size = 1; - num_tasks = (ui32)total_size > 1 ? (ui32)total_size : 1; - } - std::atomic blocks_running = 0; - for (ui32 t = 0; t < num_tasks; t++) { - T start = (T)(t * block_size + first_index); - T end = (t == num_tasks - 1) ? last_index : (T)((t + 1) * block_size + first_index - 1); - blocks_running++; - push_task([start, end, &loop, &blocks_running] { - for (T i = start; i <= end; i++) - loop(i); - blocks_running--; - }); - } - while (blocks_running != 0) { - sleep_or_yield(); - } - } - - /** - * @brief Push a function with no arguments or return value into the task queue. - * - * @tparam F The type of the function. - * @param task The function to push. - */ - template - void push_task(F const& task) - { - tasks_total++; - { - std::scoped_lock const lock(queue_mutex); - tasks.push(std::function(task)); - } - } - - /** - * @brief Push a function with arguments, but no return value, into the task queue. - * @details The function is wrapped inside a lambda in order to hide the arguments, as the tasks - * in the queue must be of type std::function, so they cannot have any arguments or return - * value. If no arguments are provided, the other overload will be used, in order to avoid the - * (slight) overhead of using a lambda. - * - * @tparam F The type of the function. - * @tparam A The types of the arguments. - * @param task The function to push. - * @param args The arguments to pass to the function. - */ - template - void push_task(F const& task, A const&... args) - { - push_task([task, args...] { task(args...); }); - } - - /** - * @brief Reset the number of threads in the pool. Waits for all currently running tasks to be - * completed, then destroys all threads in the pool and creates a new thread pool with the new - * number of threads. Any tasks that were waiting in the queue before the pool was reset will then - * be executed by the new threads. If the pool was paused before resetting it, the new pool will - * be paused as well. - * - * @param _thread_count The number of threads to use. The default value is the total number of - * hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this - * will be twice the number of CPU cores. If the argument is zero, the default value will be used - * instead. - */ - void reset(ui32 const& _thread_count = std::thread::hardware_concurrency()) - { - bool was_paused = paused; - paused = true; - wait_for_tasks(); - running = false; - destroy_threads(); - thread_count = _thread_count ? _thread_count : std::thread::hardware_concurrency(); - threads = std::make_unique(thread_count); - paused = was_paused; - create_threads(); - running = true; - } - - /** - * @brief Submit a function with zero or more arguments and a return value into the task queue, - * and get a future for its eventual returned value. - * - * @tparam F The type of the function. - * @tparam A The types of the zero or more arguments to pass to the function. - * @tparam R The return type of the function. - * @param task The function to submit. - * @param args The zero or more arguments to pass to the function. 
- * @return A future to be used later to obtain the function's returned value, waiting for it to - * finish its execution if needed. - */ - template , std::decay_t...>> - std::future submit(F const& task, A const&... args) - { - std::shared_ptr> promise(new std::promise); - std::future future = promise->get_future(); - push_task([task, args..., promise] { - try { - if constexpr (std::is_void_v) { - task(args...); - promise->set_value(); - } else { - promise->set_value(task(args...)); - } - } catch (...) { - promise->set_exception(std::current_exception()); - }; - }); - return future; - } - - /** - * @brief Wait for tasks to be completed. Normally, this function waits for all tasks, both those - * that are currently running in the threads and those that are still waiting in the queue. - * However, if the variable paused is set to true, this function only waits for the currently - * running tasks (otherwise it would wait forever). To wait for a specific task, use submit() - * instead, and call the wait() member function of the generated future. - */ - void wait_for_tasks() - { - while (true) { - if (!paused) { - if (tasks_total == 0) break; - } else { - if (get_tasks_running() == 0) break; - } - sleep_or_yield(); - } - } - - /** - * @brief An atomic variable indicating to the workers to pause. When set to true, the workers - * temporarily stop popping new tasks out of the queue, although any tasks already executed will - * keep running until they are done. Set to false again to resume popping tasks. - */ - std::atomic paused = false; - - /** - * @brief The duration, in microseconds, that the worker function should sleep for when it cannot - * find any tasks in the queue. If set to 0, then instead of sleeping, the worker function will - * execute std::this_thread::yield() if there are no tasks in the queue. The default value is - * 1000. - */ - ui32 sleep_duration = 1000; - - private: - /** - * @brief Create the threads in the pool and assign a worker to each thread. - */ - void create_threads() - { - for (ui32 i = 0; i < thread_count; i++) { - threads[i] = std::thread(&thread_pool::worker, this); - } - } - - /** - * @brief Destroy the threads in the pool by joining them. - */ - void destroy_threads() - { - for (ui32 i = 0; i < thread_count; i++) { - threads[i].join(); - } - } - - /** - * @brief Try to pop a new task out of the queue. - * - * @param task A reference to the task. Will be populated with a function if the queue is not - * empty. - * @return true if a task was found, false if the queue is empty. - */ - bool pop_task(std::function& task) - { - std::scoped_lock const lock(queue_mutex); - if (tasks.empty()) - return false; - else { - task = std::move(tasks.front()); - tasks.pop(); - return true; - } - } - - /** - * @brief Sleep for sleep_duration microseconds. If that variable is set to zero, yield instead. - * - */ - void sleep_or_yield() - { - if (sleep_duration) - std::this_thread::sleep_for(std::chrono::microseconds(sleep_duration)); - else - std::this_thread::yield(); - } - - /** - * @brief A worker function to be assigned to each thread in the pool. Continuously pops tasks out - * of the queue and executes them, as long as the atomic variable running is set to true. - */ - void worker() - { - while (running) { - std::function task; - if (!paused && pop_task(task)) { - task(); - tasks_total--; - } else { - sleep_or_yield(); - } - } - } - - /** - * @brief A mutex to synchronize access to the task queue by different threads. 
- */ - mutable std::mutex queue_mutex; - - /** - * @brief An atomic variable indicating to the workers to keep running. When set to false, the - * workers permanently stop working. - */ - std::atomic running = true; - - /** - * @brief A queue of tasks to be executed by the threads. - */ - std::queue> tasks; - - /** - * @brief The number of threads in the pool. - */ - ui32 thread_count; - - /** - * @brief A smart pointer to manage the memory allocated for the threads. - */ - std::unique_ptr threads; - - /** - * @brief An atomic variable to keep track of the total number of unfinished tasks - either still - * in the queue, or running in a thread. - */ - std::atomic tasks_total = 0; -}; - -} // namespace detail -} // namespace cudf diff --git a/cpp/include/cudf/utilities/traits.cuh b/cpp/include/cudf/utilities/traits.cuh index 43587ffa583..5e52e9a9cd9 100644 --- a/cpp/include/cudf/utilities/traits.cuh +++ b/cpp/include/cudf/utilities/traits.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup utility_types @@ -64,4 +64,4 @@ constexpr inline bool has_atomic_support(data_type type) /** @} */ -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index d191e44228a..3f37ae02151 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -24,7 +24,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup utility_types @@ -622,4 +622,4 @@ struct is_convertible, cudf::detail::timestam /** @} */ -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/type_checks.hpp b/cpp/include/cudf/utilities/type_checks.hpp index fd3b0581c11..4fcbca09d17 100644 --- a/cpp/include/cudf/utilities/type_checks.hpp +++ b/cpp/include/cudf/utilities/type_checks.hpp @@ -20,7 +20,7 @@ #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @brief Compare the types of two `column_view`s @@ -147,4 +147,4 @@ inline bool all_have_same_types(ForwardIt first, ForwardIt last) }); } -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index 1aad197b1e3..15b5f921c1b 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,7 +31,7 @@ * @brief Defines the mapping between `cudf::type_id` runtime type information * and concrete C++ types. 
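As the header comment says, `type_dispatcher` is the bridge from a runtime `cudf::type_id` to a concrete C++ type: it instantiates a functor's templated call operator for the matching type. A minimal sketch of a dispatch (the functor name is illustrative; cudf's own `size_of(data_type)`, declared in types.hpp above, provides this exact mapping):

    // Illustrative dispatch target: map a runtime data_type to sizeof(T).
    struct size_of_functor {
      template <typename T>
      constexpr std::size_t operator()() const
      {
        return sizeof(T);
      }
    };

    // Dispatches to size_of_functor::operator()<int32_t>() and returns 4.
    std::size_t bytes =
      cudf::type_dispatcher(cudf::data_type{cudf::type_id::INT32}, size_of_functor{});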
*/ -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup utility_dispatcher * @{ @@ -626,4 +626,4 @@ CUDF_HOST_DEVICE __forceinline__ constexpr decltype(auto) double_type_dispatcher std::string type_to_name(data_type type); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/wrappers/dictionary.hpp b/cpp/include/cudf/wrappers/dictionary.hpp index 95f4ac00a53..3b1958e7d4f 100644 --- a/cpp/include/cudf/wrappers/dictionary.hpp +++ b/cpp/include/cudf/wrappers/dictionary.hpp @@ -27,7 +27,7 @@ * @brief Concrete type definition for dictionary columns. */ -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup dictionary_classes * @{ @@ -217,4 +217,4 @@ CUDF_HOST_DEVICE inline bool operator>(dictionary_wrapper const& lhs, using dictionary32 = dictionary_wrapper; ///< 32-bit integer indexed dictionary wrapper /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/wrappers/durations.hpp b/cpp/include/cudf/wrappers/durations.hpp index 840dba4f4ba..8c321cba34a 100644 --- a/cpp/include/cudf/wrappers/durations.hpp +++ b/cpp/include/cudf/wrappers/durations.hpp @@ -16,9 +16,11 @@ #pragma once +#include + #include -namespace cudf { +namespace CUDF_EXPORT cudf { /** * @addtogroup timestamp_classes Timestamp @@ -65,4 +67,4 @@ static_assert(sizeof(duration_us) == sizeof(typename duration_us::rep)); static_assert(sizeof(duration_ns) == sizeof(typename duration_ns::rep)); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/wrappers/timestamps.hpp b/cpp/include/cudf/wrappers/timestamps.hpp index 5194a3e8f96..1f5d54c6119 100644 --- a/cpp/include/cudf/wrappers/timestamps.hpp +++ b/cpp/include/cudf/wrappers/timestamps.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include /** @@ -23,7 +24,7 @@ * @brief Concrete type definitions for int32_t and int64_t timestamps in * varying resolutions as durations since the UNIX epoch. */ -namespace cudf { +namespace CUDF_EXPORT cudf { namespace detail { // TODO: Use chrono::utc_clock when available in libcu++? template @@ -82,4 +83,4 @@ static_assert(sizeof(timestamp_us) == sizeof(typename timestamp_us::rep)); static_assert(sizeof(timestamp_ns) == sizeof(typename timestamp_ns::rep)); /** @} */ // end of group -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp index 0e35ff64af4..04bd51e9aa3 100644 --- a/cpp/include/cudf_test/base_fixture.hpp +++ b/cpp/include/cudf_test/base_fixture.hpp @@ -19,13 +19,14 @@ #include #include +#include #include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace test { /** @@ -99,4 +100,4 @@ class TempDirTestEnvironment : public ::testing::Environment { }; } // namespace test -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index c83599a8072..944c6195afb 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -24,11 +24,13 @@ #include #include #include +#include #include #include -namespace cudf::test { +namespace CUDF_EXPORT cudf { +namespace test { /** * @brief Verbosity level of output from column and table comparison functions. @@ -194,7 +196,7 @@ std::pair, std::vector> to_host(column_view * `column_view`'s data, and second is the column's bitmask. 
*/ template ()>* = nullptr> -std::pair, std::vector> to_host(column_view c); +CUDF_EXPORT std::pair, std::vector> to_host(column_view c); /** * @brief Copies the data and bitmask of a `column_view` of strings @@ -207,7 +209,8 @@ std::pair, std::vector> to_host(column_view * and second is the column's bitmask. */ template <> -std::pair, std::vector> to_host(column_view c); +CUDF_EXPORT std::pair, std::vector> to_host( + column_view c); //! @endcond /** @@ -233,7 +236,8 @@ struct large_strings_enabler { void disable(); }; -} // namespace cudf::test +} // namespace test +} // namespace CUDF_EXPORT cudf // Macros for showing line of failure. #define CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL(lhs, rhs) \ diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 2abd6f0abac..4e504ec1d30 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -33,6 +32,7 @@ #include #include #include +#include #include #include @@ -51,7 +51,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace test { namespace detail { /** @@ -1755,7 +1755,7 @@ class lists_column_wrapper : public detail::column_wrapper { normalize_column(lists_column_view(col).child(), lists_column_view(expected_hierarchy).child()), col.null_count(), - cudf::detail::copy_bitmask( + cudf::copy_bitmask( col, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), cudf::test::get_default_stream()); } @@ -1970,4 +1970,4 @@ class structs_column_wrapper : public detail::column_wrapper { }; } // namespace test -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf_test/debug_utilities.hpp b/cpp/include/cudf_test/debug_utilities.hpp index a0881490b82..049b4579316 100644 --- a/cpp/include/cudf_test/debug_utilities.hpp +++ b/cpp/include/cudf_test/debug_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,8 +18,10 @@ #include #include +#include -namespace cudf::test { +namespace CUDF_EXPORT cudf { +namespace test { /** * @brief Formats a column view as a string @@ -44,4 +46,5 @@ std::vector to_strings(cudf::column_view const& col); */ void print(cudf::column_view const& col, std::ostream& os = std::cout); -} // namespace cudf::test +} // namespace test +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf_test/default_stream.hpp b/cpp/include/cudf_test/default_stream.hpp index 1da97d71f44..4f63add3071 100644 --- a/cpp/include/cudf_test/default_stream.hpp +++ b/cpp/include/cudf_test/default_stream.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,9 +16,11 @@ #pragma once +#include + #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace test { /** @@ -38,4 +40,4 @@ namespace test { rmm::cuda_stream_view const get_default_stream(); } // namespace test -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf_test/file_utilities.hpp b/cpp/include/cudf_test/file_utilities.hpp index defc6f95823..37347e563cd 100644 --- a/cpp/include/cudf_test/file_utilities.hpp +++ b/cpp/include/cudf_test/file_utilities.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -29,7 +30,7 @@ * @brief RAII class for creating a temporary directory. * */ -class temp_directory { +class CUDF_EXPORT temp_directory { std::string _path; public: diff --git a/cpp/include/cudf_test/io_metadata_utilities.hpp b/cpp/include/cudf_test/io_metadata_utilities.hpp index 6fd1a52239c..c18d427d905 100644 --- a/cpp/include/cudf_test/io_metadata_utilities.hpp +++ b/cpp/include/cudf_test/io_metadata_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,10 @@ #pragma once #include +#include -namespace cudf::test { +namespace CUDF_EXPORT cudf { +namespace test { void expect_metadata_equal(cudf::io::table_input_metadata in_meta, cudf::io::table_metadata out_meta); @@ -28,4 +30,5 @@ void expect_metadata_equal(cudf::io::table_input_metadata in_meta, */ void expect_metadata_equal(cudf::io::table_metadata lhs_meta, cudf::io::table_metadata rhs_meta); -} // namespace cudf::test +} // namespace test +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf_test/iterator_utilities.hpp b/cpp/include/cudf_test/iterator_utilities.hpp index 10f6e77d889..8db0275d2f4 100644 --- a/cpp/include/cudf_test/iterator_utilities.hpp +++ b/cpp/include/cudf_test/iterator_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,13 +18,14 @@ #include #include +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace test { namespace iterators { /** @@ -136,4 +137,4 @@ template } // namespace iterators } // namespace test -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf_test/print_utilities.cuh b/cpp/include/cudf_test/print_utilities.cuh index ae6c8cef029..828188e65c3 100644 --- a/cpp/include/cudf_test/print_utilities.cuh +++ b/cpp/include/cudf_test/print_utilities.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -25,7 +26,8 @@ #include -namespace cudf::test::print { +namespace CUDF_EXPORT cudf { +namespace test::print { constexpr int32_t hex_tag = 0; @@ -137,4 +139,5 @@ void print_array(std::size_t count, rmm::cuda_stream_view stream, Ts... 
args) } } -} // namespace cudf::test::print +} // namespace test::print +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf_test/random.hpp b/cpp/include/cudf_test/random.hpp index f4d539ecffe..fe1fb0a14bf 100644 --- a/cpp/include/cudf_test/random.hpp +++ b/cpp/include/cudf_test/random.hpp @@ -16,11 +16,12 @@ #pragma once +#include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace test { template @@ -170,4 +171,4 @@ class UniformRandomGenerator { }; } // namespace test -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp index 5a077e86a0f..417bbb3d9ab 100644 --- a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp +++ b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp @@ -24,13 +24,11 @@ #include +namespace cudf::test { + /** * @brief Resource that verifies that the default stream is not used in any allocation. - * - * @tparam Upstream Type of the upstream resource used for - * allocation/deallocation. */ -template class stream_checking_resource_adaptor final : public rmm::mr::device_memory_resource { public: /** @@ -40,14 +38,13 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res * * @param upstream The resource used for allocating/deallocating device memory */ - stream_checking_resource_adaptor(Upstream* upstream, + stream_checking_resource_adaptor(rmm::device_async_resource_ref upstream, bool error_on_invalid_stream, bool check_default_stream) : upstream_{upstream}, error_on_invalid_stream_{error_on_invalid_stream}, check_default_stream_{check_default_stream} { - CUDF_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); } stream_checking_resource_adaptor() = delete; @@ -86,7 +83,7 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { verify_stream(stream); - return upstream_->allocate(bytes, stream); + return upstream_.allocate_async(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, stream); } /** @@ -101,7 +98,7 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override { verify_stream(stream); - upstream_->deallocate(ptr, bytes, stream); + upstream_.deallocate_async(ptr, bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, stream); } /** @@ -113,8 +110,8 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override { if (this == &other) { return true; } - auto cast = dynamic_cast const*>(&other); - if (cast == nullptr) { return upstream_->is_equal(other); } + auto cast = dynamic_cast(&other); + if (cast == nullptr) { return false; } return get_upstream_resource() == cast->get_upstream_resource(); } @@ -150,7 +147,8 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res } } - Upstream* upstream_; // the upstream resource used for satisfying allocation requests + rmm::device_async_resource_ref + upstream_; // the upstream resource used for satisfying allocation requests bool error_on_invalid_stream_; // If true, throw an exception when the wrong stream is detected. // If false, simply print to stdout. bool check_default_stream_; // If true, throw an exception when the default stream is observed. 
@@ -158,17 +156,4 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res // cudf::test::get_default_stream() is observed. }; -/** - * @brief Convenience factory to return a `stream_checking_resource_adaptor` around the - * upstream resource `upstream`. - * - * @tparam Upstream Type of the upstream `device_memory_resource`. - * @param upstream Pointer to the upstream resource - */ -template -stream_checking_resource_adaptor make_stream_checking_resource_adaptor( - Upstream* upstream, bool error_on_invalid_stream, bool check_default_stream) -{ - return stream_checking_resource_adaptor{ - upstream, error_on_invalid_stream, check_default_stream}; -} +} // namespace cudf::test diff --git a/cpp/include/cudf_test/table_utilities.hpp b/cpp/include/cudf_test/table_utilities.hpp index 79229df4cd9..5e60419d679 100644 --- a/cpp/include/cudf_test/table_utilities.hpp +++ b/cpp/include/cudf_test/table_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,8 +18,10 @@ #include #include +#include -namespace cudf::test::detail { +namespace CUDF_EXPORT cudf { +namespace test::detail { /** * @brief Verifies the property equality of two tables. * @@ -57,7 +59,8 @@ void expect_tables_equal(cudf::table_view lhs, cudf::table_view rhs); */ void expect_tables_equivalent(cudf::table_view lhs, cudf::table_view rhs); -} // namespace cudf::test::detail +} // namespace test::detail +} // namespace CUDF_EXPORT cudf // Macros for showing line of failure. #define CUDF_TEST_EXPECT_TABLE_PROPERTIES_EQUAL(lhs, rhs) \ diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh index 742cd764a1f..5fd2403b0f2 100644 --- a/cpp/include/cudf_test/tdigest_utilities.cuh +++ b/cpp/include/cudf_test/tdigest_utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -37,7 +38,7 @@ // for use with groupby and reduction aggregation tests. -namespace cudf { +namespace CUDF_EXPORT cudf { namespace test { using expected_value = thrust::tuple; @@ -583,4 +584,4 @@ void tdigest_merge_empty(MergeFunc merge_op) } } // namespace test -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index 66b831b917f..ed83ddabb00 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -32,7 +33,7 @@ #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace test { /// MR factory functions @@ -92,7 +93,7 @@ inline std::shared_ptr create_memory_resource( } } // namespace test -} // namespace cudf +} // namespace CUDF_EXPORT cudf /** * @brief Parses the cuDF test command line options. 
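The removal of `make_stream_checking_resource_adaptor` above goes together with the adaptor's switch from a `template <typename Upstream>` pointer to a type-erased `rmm::device_async_resource_ref`: the class is no longer a template, so the factory has nothing left to deduce and callers construct the adaptor directly, as the `make_stream_mode_adaptor` hunk just below shows. A standalone sketch of the same pattern (the `cuda_mr` upstream is an illustrative choice, not from the patch):

    // Hedged sketch: construct the refactored adaptor directly from an
    // upstream resource; cuda_mr is an example upstream, not from the patch.
    rmm::mr::cuda_memory_resource cuda_mr;
    auto adaptor = cudf::test::stream_checking_resource_adaptor(
      cuda_mr, /*error_on_invalid_stream=*/true, /*check_default_stream=*/true);
    rmm::mr::set_current_device_resource(&adaptor);

Since `device_async_resource_ref` is a non-owning reference, the caller keeps responsibility for the upstream resource's lifetime, just as the old raw-pointer version required.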
@@ -182,8 +183,8 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) auto const stream_error_mode = cmd_opts["stream_error_mode"].as(); auto const error_on_invalid_stream = (stream_error_mode == "error"); auto const check_default_stream = (stream_mode == "new_cudf_default"); - auto adaptor = - make_stream_checking_resource_adaptor(resource, error_on_invalid_stream, check_default_stream); + auto adaptor = cudf::test::stream_checking_resource_adaptor( + resource, error_on_invalid_stream, check_default_stream); if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { rmm::mr::set_current_device_resource(&adaptor); } diff --git a/cpp/include/cudf_test/timestamp_utilities.cuh b/cpp/include/cudf_test/timestamp_utilities.cuh index ebd93862151..e0789210bf9 100644 --- a/cpp/include/cudf_test/timestamp_utilities.cuh +++ b/cpp/include/cudf_test/timestamp_utilities.cuh @@ -19,12 +19,13 @@ #include #include +#include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace test { using time_point_ms = cuda::std::chrono::time_point; @@ -75,4 +76,4 @@ inline cudf::test::fixed_width_column_wrapper generate_timestamps(in } } // namespace test -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf_test/type_list_utilities.hpp b/cpp/include/cudf_test/type_list_utilities.hpp index b069a34afb8..1793a8ecce0 100644 --- a/cpp/include/cudf_test/type_list_utilities.hpp +++ b/cpp/include/cudf_test/type_list_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include "cudf_gtest.hpp" +#include + /** * @file type_list_utilities.hpp * @brief Utilities for creating type lists for typed tests in Google Test @@ -68,7 +70,7 @@ * increased compile-times. Use responsibly. */ -namespace cudf { +namespace CUDF_EXPORT cudf { namespace test { // Utilities for creating parameters for typed tests on GoogleTest // @@ -627,4 +629,4 @@ using Unique = typename UniqueImpl::type; } // namespace test -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp index bbff45e2102..4cd01a09187 100644 --- a/cpp/include/cudf_test/type_lists.hpp +++ b/cpp/include/cudf_test/type_lists.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -40,7 +41,7 @@ * These lists should be used for consistency across tests as well as * future-proofing against the addition of any new types in the future. 
*/ -namespace cudf { +namespace CUDF_EXPORT cudf { namespace test { namespace detail { template @@ -433,4 +434,4 @@ static constexpr std::array non_fixed_width_type_ids{cudf::typ cudf::type_id::STRING}; } // namespace test -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/nvtext/byte_pair_encoding.hpp b/cpp/include/nvtext/byte_pair_encoding.hpp index 375d44e367a..6559933f696 100644 --- a/cpp/include/nvtext/byte_pair_encoding.hpp +++ b/cpp/include/nvtext/byte_pair_encoding.hpp @@ -20,10 +20,11 @@ #include #include #include +#include #include -namespace nvtext { +namespace CUDF_EXPORT nvtext { /** * @addtogroup nvtext_tokenize @@ -132,4 +133,4 @@ std::unique_ptr byte_pair_encoding( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/detail/generate_ngrams.hpp b/cpp/include/nvtext/detail/generate_ngrams.hpp index c4b89b6d495..7c49421560d 100644 --- a/cpp/include/nvtext/detail/generate_ngrams.hpp +++ b/cpp/include/nvtext/detail/generate_ngrams.hpp @@ -20,7 +20,7 @@ #include #include -namespace nvtext { +namespace CUDF_EXPORT nvtext { namespace detail { /** @@ -35,4 +35,4 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co rmm::device_async_resource_ref mr); } // namespace detail -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp index 0c27981f80b..438a4a9afdd 100644 --- a/cpp/include/nvtext/detail/load_hash_file.hpp +++ b/cpp/include/nvtext/detail/load_hash_file.hpp @@ -25,7 +25,7 @@ #include #include -namespace nvtext { +namespace CUDF_EXPORT nvtext { namespace detail { /** @@ -47,4 +47,4 @@ std::unique_ptr load_vocabulary_file( rmm::device_async_resource_ref mr); } // namespace detail -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/detail/tokenize.hpp b/cpp/include/nvtext/detail/tokenize.hpp index d48027e4631..57ad008f1a9 100644 --- a/cpp/include/nvtext/detail/tokenize.hpp +++ b/cpp/include/nvtext/detail/tokenize.hpp @@ -23,7 +23,7 @@ #include #include -namespace nvtext { +namespace CUDF_EXPORT nvtext { namespace detail { /** * @copydoc nvtext::tokenize(strings_column_view const&,string_scalar @@ -70,4 +70,4 @@ std::unique_ptr count_tokens(cudf::strings_column_view const& stri rmm::device_async_resource_ref mr); } // namespace detail -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp index bfdfb4d1a1c..102f2cffa18 100644 --- a/cpp/include/nvtext/edit_distance.hpp +++ b/cpp/include/nvtext/edit_distance.hpp @@ -18,11 +18,12 @@ #include #include #include +#include #include //! 
NVText APIs -namespace nvtext { +namespace CUDF_EXPORT nvtext { /** * @addtogroup nvtext_edit_distance * @{ @@ -104,4 +105,4 @@ std::unique_ptr edit_distance_matrix( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index bebe2e46023..ce79d985a49 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -18,10 +18,11 @@ #include #include #include +#include #include -namespace nvtext { +namespace CUDF_EXPORT nvtext { /** * @addtogroup nvtext_ngrams * @{ @@ -128,4 +129,4 @@ std::unique_ptr hash_character_ngrams( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/jaccard.hpp b/cpp/include/nvtext/jaccard.hpp index 649c17f0b1c..3c3486c079e 100644 --- a/cpp/include/nvtext/jaccard.hpp +++ b/cpp/include/nvtext/jaccard.hpp @@ -17,10 +17,11 @@ #include #include +#include #include -namespace nvtext { +namespace CUDF_EXPORT nvtext { /** * @addtogroup nvtext_jaccard * @{ @@ -78,4 +79,4 @@ std::unique_ptr jaccard_index( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 7d3f6059454..fc28ecfb199 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -19,11 +19,12 @@ #include #include #include +#include #include #include -namespace nvtext { +namespace CUDF_EXPORT nvtext { /** * @addtogroup nvtext_minhash * @{ @@ -151,4 +152,4 @@ std::unique_ptr minhash64( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp index 09ce323a7ae..1048cd4abad 100644 --- a/cpp/include/nvtext/ngrams_tokenize.hpp +++ b/cpp/include/nvtext/ngrams_tokenize.hpp @@ -18,10 +18,11 @@ #include #include #include +#include #include -namespace nvtext { +namespace CUDF_EXPORT nvtext { /** * @addtogroup nvtext_ngrams * @{ @@ -86,4 +87,4 @@ std::unique_ptr ngrams_tokenize( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index e5967e78318..ec0b8981f8f 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -17,11 +17,12 @@ #include #include +#include #include //! NVText APIs -namespace nvtext { +namespace CUDF_EXPORT nvtext { /** * @addtogroup nvtext_normalize * @{ @@ -108,4 +109,4 @@ std::unique_ptr normalize_characters( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp index aac21346c72..eedcd3976ca 100644 --- a/cpp/include/nvtext/replace.hpp +++ b/cpp/include/nvtext/replace.hpp @@ -18,11 +18,12 @@ #include #include #include +#include #include //! 
NVText APIs -namespace nvtext { +namespace CUDF_EXPORT nvtext { /** * @addtogroup nvtext_replace * @{ @@ -142,4 +143,4 @@ std::unique_ptr filter_tokens( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp index 20b81aba661..4607c42ceed 100644 --- a/cpp/include/nvtext/stemmer.hpp +++ b/cpp/include/nvtext/stemmer.hpp @@ -18,10 +18,11 @@ #include #include #include +#include #include -namespace nvtext { +namespace CUDF_EXPORT nvtext { /** * @addtogroup nvtext_stemmer * @{ @@ -172,4 +173,4 @@ std::unique_ptr porter_stemmer_measure( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp index a4e06495a1d..b5636c8401b 100644 --- a/cpp/include/nvtext/subword_tokenize.hpp +++ b/cpp/include/nvtext/subword_tokenize.hpp @@ -18,10 +18,11 @@ #include #include #include +#include #include -namespace nvtext { +namespace CUDF_EXPORT nvtext { /** * @addtogroup nvtext_tokenize @@ -160,4 +161,4 @@ tokenizer_result subword_tokenize( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp index 29fed0759c7..833b53efcde 100644 --- a/cpp/include/nvtext/tokenize.hpp +++ b/cpp/include/nvtext/tokenize.hpp @@ -18,10 +18,11 @@ #include #include #include +#include #include -namespace nvtext { +namespace CUDF_EXPORT nvtext { /** * @addtogroup nvtext_tokenize * @{ @@ -309,4 +310,4 @@ std::unique_ptr tokenize_with_vocabulary( rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of tokenize group -} // namespace nvtext +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 5422304c5cb..a60a7f63882 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -423,13 +424,16 @@ std::unique_ptr make_sum_aggregation() { return std::make_unique(); } -template std::unique_ptr make_sum_aggregation(); -template std::unique_ptr make_sum_aggregation(); -template std::unique_ptr make_sum_aggregation(); -template std::unique_ptr make_sum_aggregation(); -template std::unique_ptr make_sum_aggregation(); -template std::unique_ptr make_sum_aggregation(); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr make_sum_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_sum_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_sum_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_sum_aggregation(); +template CUDF_EXPORT std::unique_ptr make_sum_aggregation(); +template CUDF_EXPORT std::unique_ptr make_sum_aggregation(); +template CUDF_EXPORT std::unique_ptr make_sum_aggregation(); /// Factory to create a PRODUCT aggregation @@ -438,13 +442,15 @@ std::unique_ptr make_product_aggregation() { return std::make_unique(); } -template std::unique_ptr make_product_aggregation(); -template std::unique_ptr make_product_aggregation(); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr make_product_aggregation(); +template CUDF_EXPORT 
std::unique_ptr +make_product_aggregation(); +template CUDF_EXPORT std::unique_ptr make_product_aggregation(); -template std::unique_ptr make_product_aggregation(); -template std::unique_ptr make_product_aggregation(); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr +make_product_aggregation(); +template CUDF_EXPORT std::unique_ptr make_product_aggregation(); +template CUDF_EXPORT std::unique_ptr make_product_aggregation(); /// Factory to create a MIN aggregation @@ -453,13 +459,16 @@ std::unique_ptr make_min_aggregation() { return std::make_unique(); } -template std::unique_ptr make_min_aggregation(); -template std::unique_ptr make_min_aggregation(); -template std::unique_ptr make_min_aggregation(); -template std::unique_ptr make_min_aggregation(); -template std::unique_ptr make_min_aggregation(); -template std::unique_ptr make_min_aggregation(); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr make_min_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_min_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_min_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_min_aggregation(); +template CUDF_EXPORT std::unique_ptr make_min_aggregation(); +template CUDF_EXPORT std::unique_ptr make_min_aggregation(); +template CUDF_EXPORT std::unique_ptr make_min_aggregation(); /// Factory to create a MAX aggregation @@ -468,13 +477,16 @@ std::unique_ptr make_max_aggregation() { return std::make_unique(); } -template std::unique_ptr make_max_aggregation(); -template std::unique_ptr make_max_aggregation(); -template std::unique_ptr make_max_aggregation(); -template std::unique_ptr make_max_aggregation(); -template std::unique_ptr make_max_aggregation(); -template std::unique_ptr make_max_aggregation(); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr make_max_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_max_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_max_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_max_aggregation(); +template CUDF_EXPORT std::unique_ptr make_max_aggregation(); +template CUDF_EXPORT std::unique_ptr make_max_aggregation(); +template CUDF_EXPORT std::unique_ptr make_max_aggregation(); /// Factory to create a COUNT aggregation @@ -485,14 +497,14 @@ std::unique_ptr make_count_aggregation(null_policy null_handling) (null_handling == null_policy::INCLUDE) ? 
aggregation::COUNT_ALL : aggregation::COUNT_VALID; return std::make_unique(kind); } -template std::unique_ptr make_count_aggregation( - null_policy null_handling); -template std::unique_ptr make_count_aggregation( - null_policy null_handling); -template std::unique_ptr make_count_aggregation( - null_policy null_handling); -template std::unique_ptr make_count_aggregation( +template CUDF_EXPORT std::unique_ptr make_count_aggregation( null_policy null_handling); +template CUDF_EXPORT std::unique_ptr +make_count_aggregation(null_policy null_handling); +template CUDF_EXPORT std::unique_ptr +make_count_aggregation(null_policy null_handling); +template CUDF_EXPORT std::unique_ptr +make_count_aggregation(null_policy null_handling); /// Factory to create a HISTOGRAM aggregation template @@ -500,9 +512,11 @@ std::unique_ptr make_histogram_aggregation() { return std::make_unique(); } -template std::unique_ptr make_histogram_aggregation(); -template std::unique_ptr make_histogram_aggregation(); -template std::unique_ptr make_histogram_aggregation(); +template CUDF_EXPORT std::unique_ptr make_histogram_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_histogram_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_histogram_aggregation(); /// Factory to create a ANY aggregation template @@ -510,9 +524,9 @@ std::unique_ptr make_any_aggregation() { return std::make_unique(); } -template std::unique_ptr make_any_aggregation(); -template std::unique_ptr make_any_aggregation(); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr make_any_aggregation(); +template CUDF_EXPORT std::unique_ptr make_any_aggregation(); +template CUDF_EXPORT std::unique_ptr make_any_aggregation(); /// Factory to create a ALL aggregation @@ -521,9 +535,9 @@ std::unique_ptr make_all_aggregation() { return std::make_unique(); } -template std::unique_ptr make_all_aggregation(); -template std::unique_ptr make_all_aggregation(); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr make_all_aggregation(); +template CUDF_EXPORT std::unique_ptr make_all_aggregation(); +template CUDF_EXPORT std::unique_ptr make_all_aggregation(); /// Factory to create a SUM_OF_SQUARES aggregation @@ -532,11 +546,12 @@ std::unique_ptr make_sum_of_squares_aggregation() { return std::make_unique(); } -template std::unique_ptr make_sum_of_squares_aggregation(); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr make_sum_of_squares_aggregation(); +template CUDF_EXPORT std::unique_ptr make_sum_of_squares_aggregation(); -template std::unique_ptr make_sum_of_squares_aggregation(); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr +make_sum_of_squares_aggregation(); +template CUDF_EXPORT std::unique_ptr make_sum_of_squares_aggregation(); /// Factory to create a MEAN aggregation @@ -545,11 +560,14 @@ std::unique_ptr make_mean_aggregation() { return std::make_unique(); } -template std::unique_ptr make_mean_aggregation(); -template std::unique_ptr make_mean_aggregation(); -template std::unique_ptr make_mean_aggregation(); -template std::unique_ptr make_mean_aggregation(); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr make_mean_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_mean_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_mean_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_mean_aggregation(); +template CUDF_EXPORT std::unique_ptr make_mean_aggregation(); /// Factory to create a M2 aggregation @@ -558,8 +576,9 @@ std::unique_ptr make_m2_aggregation() { 
return std::make_unique(); } -template std::unique_ptr make_m2_aggregation(); -template std::unique_ptr make_m2_aggregation(); +template CUDF_EXPORT std::unique_ptr make_m2_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_m2_aggregation(); /// Factory to create a VARIANCE aggregation template @@ -567,14 +586,15 @@ std::unique_ptr make_variance_aggregation(size_type ddof) { return std::make_unique(ddof); } -template std::unique_ptr make_variance_aggregation(size_type ddof); -template std::unique_ptr make_variance_aggregation( - size_type ddof); -template std::unique_ptr make_variance_aggregation( +template CUDF_EXPORT std::unique_ptr make_variance_aggregation( size_type ddof); -template std::unique_ptr make_variance_aggregation( - size_type ddof); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr +make_variance_aggregation(size_type ddof); +template CUDF_EXPORT std::unique_ptr +make_variance_aggregation(size_type ddof); +template CUDF_EXPORT std::unique_ptr +make_variance_aggregation(size_type ddof); +template CUDF_EXPORT std::unique_ptr make_variance_aggregation(size_type ddof); /// Factory to create a STD aggregation @@ -583,14 +603,14 @@ std::unique_ptr make_std_aggregation(size_type ddof) { return std::make_unique(ddof); } -template std::unique_ptr make_std_aggregation(size_type ddof); -template std::unique_ptr make_std_aggregation( +template CUDF_EXPORT std::unique_ptr make_std_aggregation(size_type ddof); +template CUDF_EXPORT std::unique_ptr make_std_aggregation( size_type ddof); -template std::unique_ptr make_std_aggregation( +template CUDF_EXPORT std::unique_ptr make_std_aggregation( size_type ddof); -template std::unique_ptr make_std_aggregation( +template CUDF_EXPORT std::unique_ptr make_std_aggregation( size_type ddof); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr make_std_aggregation(size_type ddof); /// Factory to create a MEDIAN aggregation @@ -599,9 +619,11 @@ std::unique_ptr make_median_aggregation() { return std::make_unique(); } -template std::unique_ptr make_median_aggregation(); -template std::unique_ptr make_median_aggregation(); -template std::unique_ptr make_median_aggregation(); +template CUDF_EXPORT std::unique_ptr make_median_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_median_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_median_aggregation(); /// Factory to create a QUANTILE aggregation template @@ -610,12 +632,14 @@ std::unique_ptr make_quantile_aggregation(std::vector const& quant { return std::make_unique(quantiles, interp); } -template std::unique_ptr make_quantile_aggregation( - std::vector const& quantiles, interpolation interp); -template std::unique_ptr make_quantile_aggregation( - std::vector const& quantiles, interpolation interp); -template std::unique_ptr make_quantile_aggregation( +template CUDF_EXPORT std::unique_ptr make_quantile_aggregation( std::vector const& quantiles, interpolation interp); +template CUDF_EXPORT std::unique_ptr +make_quantile_aggregation(std::vector const& quantiles, + interpolation interp); +template CUDF_EXPORT std::unique_ptr +make_quantile_aggregation(std::vector const& quantiles, + interpolation interp); /// Factory to create an ARGMAX aggregation template @@ -623,9 +647,11 @@ std::unique_ptr make_argmax_aggregation() { return std::make_unique(); } -template std::unique_ptr make_argmax_aggregation(); -template std::unique_ptr make_argmax_aggregation(); -template std::unique_ptr make_argmax_aggregation(); +template CUDF_EXPORT std::unique_ptr 
make_argmax_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_argmax_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_argmax_aggregation(); /// Factory to create an ARGMIN aggregation template @@ -633,9 +659,11 @@ std::unique_ptr make_argmin_aggregation() { return std::make_unique(); } -template std::unique_ptr make_argmin_aggregation(); -template std::unique_ptr make_argmin_aggregation(); -template std::unique_ptr make_argmin_aggregation(); +template CUDF_EXPORT std::unique_ptr make_argmin_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_argmin_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_argmin_aggregation(); /// Factory to create an NUNIQUE aggregation template @@ -643,13 +671,13 @@ std::unique_ptr make_nunique_aggregation(null_policy null_handling) { return std::make_unique(null_handling); } -template std::unique_ptr make_nunique_aggregation( - null_policy null_handling); -template std::unique_ptr make_nunique_aggregation( +template CUDF_EXPORT std::unique_ptr make_nunique_aggregation( null_policy null_handling); -template std::unique_ptr make_nunique_aggregation( - null_policy null_handling); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr +make_nunique_aggregation(null_policy null_handling); +template CUDF_EXPORT std::unique_ptr +make_nunique_aggregation(null_policy null_handling); +template CUDF_EXPORT std::unique_ptr make_nunique_aggregation(null_policy null_handling); /// Factory to create an NTH_ELEMENT aggregation @@ -658,14 +686,14 @@ std::unique_ptr make_nth_element_aggregation(size_type n, null_policy null { return std::make_unique(n, null_handling); } -template std::unique_ptr make_nth_element_aggregation( - size_type n, null_policy null_handling); -template std::unique_ptr make_nth_element_aggregation( - size_type n, null_policy null_handling); -template std::unique_ptr make_nth_element_aggregation( - size_type n, null_policy null_handling); -template std::unique_ptr make_nth_element_aggregation( +template CUDF_EXPORT std::unique_ptr make_nth_element_aggregation( size_type n, null_policy null_handling); +template CUDF_EXPORT std::unique_ptr +make_nth_element_aggregation(size_type n, null_policy null_handling); +template CUDF_EXPORT std::unique_ptr +make_nth_element_aggregation(size_type n, null_policy null_handling); +template CUDF_EXPORT std::unique_ptr +make_nth_element_aggregation(size_type n, null_policy null_handling); /// Factory to create a ROW_NUMBER aggregation template @@ -673,8 +701,9 @@ std::unique_ptr make_row_number_aggregation() { return std::make_unique(); } -template std::unique_ptr make_row_number_aggregation(); -template std::unique_ptr make_row_number_aggregation(); +template CUDF_EXPORT std::unique_ptr make_row_number_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_row_number_aggregation(); /// Factory to create an EWMA aggregation template @@ -682,9 +711,9 @@ std::unique_ptr make_ewma_aggregation(double const com, cudf::ewm_history { return std::make_unique(com, history); } -template std::unique_ptr make_ewma_aggregation(double const com, - cudf::ewm_history history); -template std::unique_ptr make_ewma_aggregation( +template CUDF_EXPORT std::unique_ptr make_ewma_aggregation( + double const com, cudf::ewm_history history); +template CUDF_EXPORT std::unique_ptr make_ewma_aggregation( double const com, cudf::ewm_history history); /// Factory to create a RANK aggregation @@ -698,19 +727,19 @@ std::unique_ptr make_rank_aggregation(rank_method method, return std::make_unique( method, 
column_order, null_handling, null_precedence, percentage); } -template std::unique_ptr make_rank_aggregation( +template CUDF_EXPORT std::unique_ptr make_rank_aggregation( rank_method method, order column_order, null_policy null_handling, null_order null_precedence, rank_percentage percentage); -template std::unique_ptr make_rank_aggregation( - rank_method method, - order column_order, - null_policy null_handling, - null_order null_precedence, - rank_percentage percentage); -template std::unique_ptr make_rank_aggregation( +template CUDF_EXPORT std::unique_ptr +make_rank_aggregation(rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage); +template CUDF_EXPORT std::unique_ptr make_rank_aggregation( rank_method method, order column_order, null_policy null_handling, @@ -723,14 +752,14 @@ std::unique_ptr make_collect_list_aggregation(null_policy null_handling) { return std::make_unique(null_handling); } -template std::unique_ptr make_collect_list_aggregation( - null_policy null_handling); -template std::unique_ptr make_collect_list_aggregation( - null_policy null_handling); -template std::unique_ptr make_collect_list_aggregation( - null_policy null_handling); -template std::unique_ptr make_collect_list_aggregation( +template CUDF_EXPORT std::unique_ptr make_collect_list_aggregation( null_policy null_handling); +template CUDF_EXPORT std::unique_ptr +make_collect_list_aggregation(null_policy null_handling); +template CUDF_EXPORT std::unique_ptr +make_collect_list_aggregation(null_policy null_handling); +template CUDF_EXPORT std::unique_ptr +make_collect_list_aggregation(null_policy null_handling); /// Factory to create a COLLECT_SET aggregation template @@ -740,14 +769,20 @@ std::unique_ptr make_collect_set_aggregation(null_policy null_handling, { return std::make_unique(null_handling, nulls_equal, nans_equal); } -template std::unique_ptr make_collect_set_aggregation( - null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); -template std::unique_ptr make_collect_set_aggregation( - null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); -template std::unique_ptr make_collect_set_aggregation( - null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); -template std::unique_ptr make_collect_set_aggregation( +template CUDF_EXPORT std::unique_ptr make_collect_set_aggregation( null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal); +template CUDF_EXPORT std::unique_ptr +make_collect_set_aggregation(null_policy null_handling, + null_equality nulls_equal, + nan_equality nans_equal); +template CUDF_EXPORT std::unique_ptr +make_collect_set_aggregation(null_policy null_handling, + null_equality nulls_equal, + nan_equality nans_equal); +template CUDF_EXPORT std::unique_ptr +make_collect_set_aggregation(null_policy null_handling, + null_equality nulls_equal, + nan_equality nans_equal); /// Factory to create a LAG aggregation template @@ -755,8 +790,9 @@ std::unique_ptr make_lag_aggregation(size_type offset) { return std::make_unique(aggregation::LAG, offset); } -template std::unique_ptr make_lag_aggregation(size_type offset); -template std::unique_ptr make_lag_aggregation( +template CUDF_EXPORT std::unique_ptr make_lag_aggregation( + size_type offset); +template CUDF_EXPORT std::unique_ptr make_lag_aggregation( size_type offset); /// Factory to create a LEAD aggregation @@ -765,9 +801,10 @@ std::unique_ptr make_lead_aggregation(size_type 
offset) { return std::make_unique(aggregation::LEAD, offset); } -template std::unique_ptr make_lead_aggregation(size_type offset); -template std::unique_ptr make_lead_aggregation( +template CUDF_EXPORT std::unique_ptr make_lead_aggregation( size_type offset); +template CUDF_EXPORT std::unique_ptr +make_lead_aggregation(size_type offset); /// Factory to create a UDF aggregation template @@ -781,9 +818,9 @@ std::unique_ptr make_udf_aggregation(udf_type type, output_type}; return std::unique_ptr(a); } -template std::unique_ptr make_udf_aggregation( +template CUDF_EXPORT std::unique_ptr make_udf_aggregation( udf_type type, std::string const& user_defined_aggregator, data_type output_type); -template std::unique_ptr make_udf_aggregation( +template CUDF_EXPORT std::unique_ptr make_udf_aggregation( udf_type type, std::string const& user_defined_aggregator, data_type output_type); /// Factory to create a MERGE_LISTS aggregation @@ -792,9 +829,11 @@ std::unique_ptr make_merge_lists_aggregation() { return std::make_unique(); } -template std::unique_ptr make_merge_lists_aggregation(); -template std::unique_ptr make_merge_lists_aggregation(); -template std::unique_ptr make_merge_lists_aggregation(); +template CUDF_EXPORT std::unique_ptr make_merge_lists_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_merge_lists_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_merge_lists_aggregation(); /// Factory to create a MERGE_SETS aggregation template @@ -803,12 +842,12 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal, { return std::make_unique(nulls_equal, nans_equal); } -template std::unique_ptr make_merge_sets_aggregation(null_equality, - nan_equality); -template std::unique_ptr make_merge_sets_aggregation( - null_equality, nan_equality); -template std::unique_ptr make_merge_sets_aggregation( +template CUDF_EXPORT std::unique_ptr make_merge_sets_aggregation( null_equality, nan_equality); +template CUDF_EXPORT std::unique_ptr + make_merge_sets_aggregation(null_equality, nan_equality); +template CUDF_EXPORT std::unique_ptr + make_merge_sets_aggregation(null_equality, nan_equality); /// Factory to create a MERGE_M2 aggregation template @@ -816,8 +855,9 @@ std::unique_ptr make_merge_m2_aggregation() { return std::make_unique(); } -template std::unique_ptr make_merge_m2_aggregation(); -template std::unique_ptr make_merge_m2_aggregation(); +template CUDF_EXPORT std::unique_ptr make_merge_m2_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_merge_m2_aggregation(); /// Factory to create a MERGE_HISTOGRAM aggregation template @@ -825,10 +865,11 @@ std::unique_ptr make_merge_histogram_aggregation() { return std::make_unique(); } -template std::unique_ptr make_merge_histogram_aggregation(); -template std::unique_ptr +template CUDF_EXPORT std::unique_ptr make_merge_histogram_aggregation(); +template CUDF_EXPORT std::unique_ptr make_merge_histogram_aggregation(); -template std::unique_ptr make_merge_histogram_aggregation(); +template CUDF_EXPORT std::unique_ptr +make_merge_histogram_aggregation(); /// Factory to create a COVARIANCE aggregation template @@ -836,10 +877,10 @@ std::unique_ptr make_covariance_aggregation(size_type min_periods, size_ty { return std::make_unique(min_periods, ddof); } -template std::unique_ptr make_covariance_aggregation( - size_type min_periods, size_type ddof); -template std::unique_ptr make_covariance_aggregation( +template CUDF_EXPORT std::unique_ptr make_covariance_aggregation( size_type min_periods, size_type ddof); +template 
CUDF_EXPORT std::unique_ptr +make_covariance_aggregation(size_type min_periods, size_type ddof); /// Factory to create a CORRELATION aggregation template @@ -847,33 +888,34 @@ std::unique_ptr make_correlation_aggregation(correlation_type type, size_t { return std::make_unique(type, min_periods); } -template std::unique_ptr make_correlation_aggregation( - correlation_type type, size_type min_periods); -template std::unique_ptr make_correlation_aggregation( +template CUDF_EXPORT std::unique_ptr make_correlation_aggregation( correlation_type type, size_type min_periods); +template CUDF_EXPORT std::unique_ptr +make_correlation_aggregation(correlation_type type, size_type min_periods); template std::unique_ptr make_tdigest_aggregation(int max_centroids) { return std::make_unique(max_centroids); } -template std::unique_ptr make_tdigest_aggregation(int max_centroids); -template std::unique_ptr make_tdigest_aggregation( - int max_centroids); -template std::unique_ptr make_tdigest_aggregation( +template CUDF_EXPORT std::unique_ptr make_tdigest_aggregation( int max_centroids); +template CUDF_EXPORT std::unique_ptr +make_tdigest_aggregation(int max_centroids); +template CUDF_EXPORT std::unique_ptr +make_tdigest_aggregation(int max_centroids); template std::unique_ptr make_merge_tdigest_aggregation(int max_centroids) { return std::make_unique(max_centroids); } -template std::unique_ptr make_merge_tdigest_aggregation( - int max_centroids); -template std::unique_ptr make_merge_tdigest_aggregation( - int max_centroids); -template std::unique_ptr make_merge_tdigest_aggregation( +template CUDF_EXPORT std::unique_ptr make_merge_tdigest_aggregation( int max_centroids); +template CUDF_EXPORT std::unique_ptr +make_merge_tdigest_aggregation(int max_centroids); +template CUDF_EXPORT std::unique_ptr +make_merge_tdigest_aggregation(int max_centroids); namespace detail { namespace { diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 8ac1491547d..3ac8547baad 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -50,6 +50,11 @@ namespace cudf { namespace binops { +bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op) +{ + return cudf::binops::compiled::is_supported_operation(out, lhs, rhs, op); +} + /** * @brief Computes output valid mask for op between a column and a scalar */ @@ -194,7 +199,7 @@ std::unique_ptr binary_operation(LhsType const& lhs, rmm::device_async_resource_ref mr) { if constexpr (std::is_same_v and std::is_same_v) - CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match"); + CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match", std::invalid_argument); if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING and output_type.id() == type_id::STRING and diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index ba0253ec853..7a0bc312434 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -18,6 +18,7 @@ #include "operation.cuh" #include "struct_binary_ops.cuh" +#include #include #include #include diff --git a/cpp/src/bitmask/is_element_valid.cpp b/cpp/src/bitmask/is_element_valid.cpp index e0f0ccdc861..4806c7a94e8 100644 --- a/cpp/src/bitmask/is_element_valid.cpp +++ b/cpp/src/bitmask/is_element_valid.cpp @@ -1,6 +1,5 @@ - /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
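// The binaryop hunk above passes a third argument to CUDF_EXPECTS so a size
// mismatch throws std::invalid_argument (a caller error) rather than the
// default logic_error. A rough sketch of how such a macro can take an explicit
// exception type; this is an illustration, not cudf's actual macro definition:
#include <stdexcept>
#include <string>

#define EXAMPLE_EXPECTS(cond, msg, extype)                                \
  do {                                                                    \
    if (!(cond)) { throw extype(std::string{"cuDF failure: "} + (msg)); } \
  } while (0)

void check_sizes(int lhs_size, int rhs_size)
{
  // Mirrors the patched call site: the mismatch is the caller's fault, so the
  // more precise std::invalid_argument is raised.
  EXAMPLE_EXPECTS(lhs_size == rhs_size, "Column sizes don't match", std::invalid_argument);
}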
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 4d16298c605..b0f9e9f0e74 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -15,8 +15,10 @@ */ #include +#include #include #include +#include #include #include #include @@ -27,10 +29,37 @@ #include #include #include +#include #include namespace cudf { namespace detail { +namespace { + +template +void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view key) noexcept +{ + if (cudf::experimental::prefetch::detail::prefetch_config::instance().get(key)) { + if (cudf::is_fixed_width(col.type())) { + cudf::experimental::prefetch::detail::prefetch_noexcept( + key, data_ptr, col.size() * size_of(col.type()), cudf::get_default_stream()); + } else if (col.type().id() == type_id::STRING) { + strings_column_view scv{col}; + + cudf::experimental::prefetch::detail::prefetch_noexcept( + key, + data_ptr, + scv.chars_size(cudf::get_default_stream()) * sizeof(char), + cudf::get_default_stream()); + } else { + std::cout << key << ": Unsupported type: " << static_cast(col.type().id()) + << std::endl; + } + } +} + +} // namespace + column_view_base::column_view_base(data_type type, size_type size, void const* data, @@ -126,6 +155,7 @@ bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs) { return shallow_equivalent_impl(lhs, rhs); } + } // namespace detail // Immutable view constructor @@ -175,6 +205,18 @@ mutable_column_view::operator column_view() const return column_view{_type, _size, _data, _null_mask, _null_count, _offset, std::move(child_views)}; } +void const* column_view::get_data() const noexcept +{ + detail::prefetch_col_data(*this, _data, "column_view::get_data"); + return _data; +} + +void const* mutable_column_view::get_data() const noexcept +{ + detail::prefetch_col_data(*this, _data, "mutable_column_view::get_data"); + return _data; +} + size_type count_descendants(column_view parent) { auto descendants = [](auto const& child) { return count_descendants(child); }; diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 6acbafd24fb..ac9931335ff 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -73,8 +74,8 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi }); // Assemble contiguous array of device views - auto device_views = thrust::host_vector(); - device_views.reserve(views.size()); + auto device_views = + cudf::detail::make_empty_host_vector(views.size(), stream); std::transform(device_view_owners.cbegin(), device_view_owners.cend(), std::back_inserter(device_views), @@ -84,7 +85,7 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi make_device_uvector_async(device_views, stream, rmm::mr::get_current_device_resource()); // Compute the partition offsets - auto offsets = thrust::host_vector(views.size() + 1); + auto offsets = cudf::detail::make_host_vector(views.size() + 1, stream); thrust::transform_inclusive_scan( thrust::host, device_views.cbegin(), diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 37db2c74790..95544742fb7 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ 
-1539,7 +1539,8 @@ std::unique_ptr chunk_iteration_state::create( std::vector num_batches_per_iteration; std::vector size_of_batches_per_iteration; - std::vector accum_size_per_iteration; + auto accum_size_per_iteration = + cudf::detail::make_empty_host_vector(h_offsets.size(), stream); std::size_t accum_size = 0; { auto current_offset_it = h_offsets.begin(); diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu index d69d214a881..581d0a00924 100644 --- a/cpp/src/copying/purge_nonempty_nulls.cu +++ b/cpp/src/copying/purge_nonempty_nulls.cu @@ -14,6 +14,7 @@ * limitations under the License. */ #include +#include #include #include #include diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 1b0d201501b..7ca1b51df98 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -485,14 +485,12 @@ std::unique_ptr
make_timezone_transition_table(std::optional ttimes_typed; - ttimes_typed.reserve(transition_times.size()); + auto ttimes_typed = make_empty_host_vector(transition_times.size(), stream); std::transform(transition_times.cbegin(), transition_times.cend(), std::back_inserter(ttimes_typed), [](auto ts) { return timestamp_s{duration_s{ts}}; }); - std::vector offsets_typed; - offsets_typed.reserve(offsets.size()); + auto offsets_typed = make_empty_host_vector(offsets.size(), stream); std::transform(offsets.cbegin(), offsets.cend(), std::back_inserter(offsets_typed), [](auto ts) { return duration_s{ts}; }); diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index fdc3d9d0ecf..72828309425 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -105,7 +105,7 @@ struct compute_children_offsets_fn { */ rmm::device_uvector create_children_offsets(rmm::cuda_stream_view stream) { - std::vector offsets(columns_ptrs.size()); + auto offsets = cudf::detail::make_host_vector(columns_ptrs.size(), stream); thrust::transform_exclusive_scan( thrust::host, columns_ptrs.begin(), diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index 37f8fa7a05b..0617d71fa51 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -77,7 +77,9 @@ std::unique_ptr make_dictionary_column(column_view const& keys_column, std::unique_ptr make_dictionary_column(std::unique_ptr keys_column, std::unique_ptr indices_column, rmm::device_buffer&& null_mask, - size_type null_count) + size_type null_count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_EXPECTS(!keys_column->has_nulls(), "keys column must not have nulls"); CUDF_EXPECTS(!indices_column->has_nulls(), "indices column must not have nulls"); @@ -89,7 +91,7 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys_colu children.emplace_back(std::move(keys_column)); return std::make_unique(data_type{type_id::DICTIONARY32}, count, - rmm::device_buffer{}, + rmm::device_buffer{0, stream, mr}, std::move(null_mask), null_count, std::move(children)); @@ -134,8 +136,11 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys, auto indices_column = [&] { // If the types match, then just commandeer the column's data buffer. if (new_type.id() == indices_type) { - return std::make_unique( - new_type, indices_size, std::move(*(contents.data.release())), rmm::device_buffer{}, 0); + return std::make_unique(new_type, + indices_size, + std::move(*(contents.data.release())), + rmm::device_buffer{0, stream, mr}, + 0); } // If the new type does not match, then convert the data. 
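// Several hunks above replace std::vector / thrust::host_vector staging
// buffers with cudf's stream-aware host vectors (make_host_vector /
// make_empty_host_vector), which may be backed by pinned memory for faster
// host-to-device copies. The shape of the pattern, sketched with plain types
// (make_empty_staging_vector is a hypothetical stand-in, not the cudf API):
#include <algorithm>
#include <cstddef>
#include <iterator>
#include <vector>

template <typename T>
std::vector<T> make_empty_staging_vector(std::size_t capacity)
{
  std::vector<T> v;
  v.reserve(capacity);  // size stays 0 so std::back_inserter appends from scratch;
                        // the real helper draws from a pinned-memory pool instead
  return v;
}

int main()
{
  std::vector<long> transition_times{100, 200, 300};
  auto typed = make_empty_staging_vector<long>(transition_times.size());
  std::transform(transition_times.cbegin(),
                 transition_times.cend(),
                 std::back_inserter(typed),
                 [](auto ts) { return ts * 2; });  // placeholder for the timestamp conversion
  return typed.size() == 3 ? 0 : 1;
}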
cudf::column_view cast_view{ diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 08a33d40abe..cf40fda5971 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/filling/calendrical_month_sequence.cu b/cpp/src/filling/calendrical_month_sequence.cu index 3e6d693dde5..f984f307ddd 100644 --- a/cpp/src/filling/calendrical_month_sequence.cu +++ b/cpp/src/filling/calendrical_month_sequence.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index 605d813ed1e..4292552a800 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -16,9 +16,16 @@ #include "arrow_utilities.hpp" +#include #include #include +#include +#include + +#include +#include + #include namespace cudf { @@ -83,9 +90,33 @@ ArrowType id_to_arrow_type(cudf::type_id id) case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT; case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE; case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32; + case cudf::type_id::DECIMAL128: return NANOARROW_TYPE_DECIMAL128; default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error); } } +ArrowType id_to_arrow_storage_type(cudf::type_id id) +{ + switch (id) { + case cudf::type_id::TIMESTAMP_SECONDS: + case cudf::type_id::TIMESTAMP_MILLISECONDS: + case cudf::type_id::TIMESTAMP_MICROSECONDS: + case cudf::type_id::TIMESTAMP_NANOSECONDS: return NANOARROW_TYPE_INT64; + case cudf::type_id::DURATION_SECONDS: + case cudf::type_id::DURATION_MILLISECONDS: + case cudf::type_id::DURATION_MICROSECONDS: + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TYPE_INT64; + default: return id_to_arrow_type(id); + } +} + +int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column) +{ + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(arr, storage_type)); + arr->length = column.size(); + arr->null_count = column.null_count(); + return NANOARROW_OK; +} + } // namespace detail } // namespace cudf diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp index 4e2628ab689..1cee3071fcb 100644 --- a/cpp/src/interop/arrow_utilities.hpp +++ b/cpp/src/interop/arrow_utilities.hpp @@ -18,8 +18,12 @@ #include +#include +#include +#include +#include + #include -#include namespace cudf { namespace detail { @@ -47,5 +51,42 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view); */ ArrowType id_to_arrow_type(cudf::type_id id); +/** + * @brief Map cudf column type id to the storage type for Arrow + * + * Specifically this is for handling the underlying storage type of + * timestamps and durations. 
+ * + * @param id column type id + * @return ArrowType storage type + */ +ArrowType id_to_arrow_storage_type(cudf::type_id id); + +/** + * @brief Helper to initialize ArrowArray struct + * + * @param arr Pointer to ArrowArray to initialize + * @param storage_type The type to initialize with + * @param column view for column to get the length and null count from + * @return nanoarrow status code, should be NANOARROW_OK if there are no errors + */ +int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column); + +/** + * @brief Helper to convert decimal values to 128-bit versions for Arrow compatibility + * + * The template parameter should be the underlying type of the data (e.g. int32_t for + * 32-bit decimal and int64_t for 64-bit decimal). + * + * @param input column_view of the data + * @param stream cuda stream to perform the operations on + * @param mr memory resource to allocate the returned device_uvector with + * @return unique_ptr to a device_buffer containing the upcasted data + */ +template +std::unique_ptr decimals_to_arrow(cudf::column_view input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + } // namespace detail } // namespace cudf diff --git a/cpp/src/interop/decimal_conversion_utilities.cu b/cpp/src/interop/decimal_conversion_utilities.cu new file mode 100644 index 00000000000..2f81c754a30 --- /dev/null +++ b/cpp/src/interop/decimal_conversion_utilities.cu @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "decimal_conversion_utilities.cuh" + +#include +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace detail { + +template +std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) +{ + static_assert(std::is_same_v or std::is_same_v, + "Only int32 and int64 decimal types can be converted to decimal128."); + + constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType); + auto buf = std::make_unique(column.size() * sizeof(__int128_t), stream, mr); + + thrust::for_each(rmm::exec_policy_nosync(stream, mr), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(column.size()), + [in = column.begin(), + out = reinterpret_cast(buf->data()), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // the lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? 
-1 : 0; + } + }); + + return buf; +} + +// Instantiate templates for int32_t and int64_t decimal types +template std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +template std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/interop/decimal_conversion_utilities.cuh b/cpp/src/interop/decimal_conversion_utilities.cuh new file mode 100644 index 00000000000..41263147404 --- /dev/null +++ b/cpp/src/interop/decimal_conversion_utilities.cuh @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include + +namespace cudf::detail { + +/** + * @brief Convert decimal32 and decimal64 numeric data to decimal128 and return the device vector + * + * @tparam DecimalType to convert from + * + * @param column A view of the input columns + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + * + * @return A device vector containing the converted decimal128 data + */ +template +std::unique_ptr convert_decimals_to_decimal128( + cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +} // namespace cudf::detail diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index e1d289e67a3..440df571de0 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -39,6 +38,7 @@ #include #include +#include namespace cudf { @@ -144,9 +144,6 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING, - "Large strings are not yet supported in from_arrow_device", - cudf::data_type_error); if (input->length == 0) { return std::make_tuple( {type, @@ -158,12 +155,15 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( {}); } - auto offsets_view = column_view{data_type(type_id::INT32), + data_type offsets_type(type_id::INT32); + if (schema->type == NANOARROW_TYPE_LARGE_STRING) { offsets_type = data_type(type_id::INT64); } + auto offsets_view = column_view{offsets_type, static_cast(input->offset + input->length) + 1, input->buffers[fixed_width_data_buffer_idx], nullptr, 0, 0}; + return std::make_tuple( {type, static_cast(input->length), diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index b3087dedf98..efde8f2a463 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -28,7 +28,6 @@ #include #include #include -#include #include #include 
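// Worked example of the sign-extension loop in convert_decimals_to_decimal128
// above: with 64-bit input, BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(int64_t)
// = 2, so each value becomes two little-endian words where every word past the
// first repeats the sign, preserving two's complement. A host-side sketch of the
// same arithmetic (illustration only; the real loop runs per element on the GPU):
#include <cstddef>
#include <cstdint>
#include <vector>

int main()
{
  std::vector<int64_t> in{5, -5};
  constexpr int ratio = sizeof(__int128_t) / sizeof(int64_t);  // == 2
  std::vector<int64_t> out(in.size() * ratio);
  for (std::size_t i = 0; i < in.size(); ++i) {
    auto const o = i * ratio;
    out[o] = in[i];  // low word carries the value
    for (int j = 1; j < ratio; ++j) {
      out[o + j] = in[i] < 0 ? -1 : 0;  // high word repeats the sign bit
    }
  }
  // out is {5, 0, -5, -1}: bit-identical to __int128_t{5} and __int128_t{-5}
  // on a little-endian machine.
  return 0;
}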
#include @@ -42,6 +41,7 @@ #include #include +#include namespace cudf { namespace detail { diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 622a3aba4bb..3d41f856f4f 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "arrow_utilities.hpp" +#include "decimal_conversion_utilities.cuh" #include "detail/arrow_allocator.hpp" #include @@ -157,33 +159,20 @@ std::shared_ptr unsupported_decimals_to_arrow(column_view input, arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); - - rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); - - auto count = thrust::make_counting_iterator(0); - - thrust::for_each( - rmm::exec_policy(cudf::get_default_stream()), - count, - count + input.size(), - [in = input.begin(), out = buf.data(), BIT_WIDTH_RATIO] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // The lowest order bits are the value, the remainder - // simply matches the sign bit to satisfy the two's - // complement integer representation of negative numbers. - out[out_idx] = in[in_idx]; -#pragma unroll BIT_WIDTH_RATIO - 1 - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; - } - }); + auto buf = detail::convert_decimals_to_decimal128( + input, stream, rmm::mr::get_current_device_resource()); - auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); + // Synchronize stream here to ensure the decimal128 buffer is ready. + stream.synchronize(); + + auto const buf_size_in_bytes = buf->size(); auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); - CUDF_CUDA_TRY(cudaMemcpyAsync( - data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), + buf->data(), + buf_size_in_bytes, + cudaMemcpyDefault, + stream.value())); auto type = arrow::decimal(precision, -input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); @@ -473,7 +462,7 @@ std::shared_ptr to_arrow(cudf::scalar const& input, { auto const column = cudf::make_column_from_scalar(input, 1, stream); cudf::table_view const tv{{column->view()}}; - auto const arrow_table = cudf::to_arrow(tv, {metadata}, stream); + auto const arrow_table = detail::to_arrow(tv, {metadata}, stream, ar_mr); auto const ac = arrow_table->column(0); auto const maybe_scalar = ac->GetScalar(0); if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); } diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index b9d3a59e647..cea7cdebcba 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -15,6 +15,7 @@ */ #include "arrow_utilities.hpp" +#include "decimal_conversion_utilities.cuh" #include #include @@ -24,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -44,6 +44,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -56,14 +57,6 @@ void device_buffer_finalize(ArrowBufferAllocator* allocator, uint8_t*, int64_t) delete unique_buffer; } -int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column) -{ - NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(arr, storage_type)); - arr->length = column.size(); - arr->null_count = column.null_count(); - return NANOARROW_OK; -} - template struct is_device_scalar : public std::false_type {}; 
@@ -99,21 +92,6 @@ int set_buffer(std::unique_ptr device_buf, int64_t i, ArrowArray* out) return NANOARROW_OK; } -ArrowType id_to_arrow_storage_type(cudf::type_id id) -{ - switch (id) { - case cudf::type_id::TIMESTAMP_SECONDS: - case cudf::type_id::TIMESTAMP_MILLISECONDS: - case cudf::type_id::TIMESTAMP_MICROSECONDS: - case cudf::type_id::TIMESTAMP_NANOSECONDS: return NANOARROW_TYPE_INT64; - case cudf::type_id::DURATION_SECONDS: - case cudf::type_id::DURATION_MILLISECONDS: - case cudf::type_id::DURATION_MICROSECONDS: - case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TYPE_INT64; - default: return id_to_arrow_type(id); - } -} - struct dispatch_to_arrow_device { template ())> int operator()(cudf::column&&, rmm::cuda_stream_view, rmm::device_async_resource_ref, ArrowArray*) @@ -156,35 +134,17 @@ struct dispatch_to_arrow_device { }; template -int decimals_to_arrow(cudf::column_view input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr, - ArrowArray* out) +int construct_decimals(cudf::column_view input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr, + ArrowArray* out) { nanoarrow::UniqueArray tmp; NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input)); - constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); - auto buf = - std::make_unique>(input.size() * BIT_WIDTH_RATIO, stream, mr); - - auto count = thrust::counting_iterator(0); - - thrust::for_each( - rmm::exec_policy(stream, mr), - count, - count + input.size(), - [in = input.begin(), out = buf->data(), BIT_WIDTH_RATIO] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // the lowest order bits are the value, the remainder - // simply matches the sign bit to satisfy the two's - // complement integer representation of negative numbers. - out[out_idx] = in[in_idx]; -#pragma unroll BIT_WIDTH_RATIO - 1 - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; - } - }); + auto buf = detail::convert_decimals_to_decimal128(input, stream, mr); + // Synchronize stream here to ensure the decimal128 buffer is ready. 
+ stream.synchronize(); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get())); ArrowArrayMove(tmp.get(), out); @@ -198,7 +158,7 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colu ArrowArray* out) { using DeviceType = int32_t; - NANOARROW_RETURN_NOT_OK(decimals_to_arrow(column.view(), stream, mr, out)); + NANOARROW_RETURN_NOT_OK(construct_decimals(column.view(), stream, mr, out)); auto contents = column.release(); NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out)); return NANOARROW_OK; @@ -211,7 +171,7 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colu ArrowArray* out) { using DeviceType = int64_t; - NANOARROW_RETURN_NOT_OK(decimals_to_arrow(column.view(), stream, mr, out)); + NANOARROW_RETURN_NOT_OK(construct_decimals(column.view(), stream, mr, out)); auto contents = column.release(); NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out)); return NANOARROW_OK; @@ -256,8 +216,15 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colum rmm::device_async_resource_ref mr, ArrowArray* out) { + ArrowType nanoarrow_type = NANOARROW_TYPE_STRING; + if (column.num_children() > 0 && + column.child(cudf::strings_column_view::offsets_column_index).type().id() == + cudf::type_id::INT64) { + nanoarrow_type = NANOARROW_TYPE_LARGE_STRING; + } + nanoarrow::UniqueArray tmp; - NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRING, column)); + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), nanoarrow_type, column)); if (column.size() == 0) { // the scalar zero here is necessary because the spec for string arrays states @@ -265,8 +232,14 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colum // the case of a 0 length string array, there should be exactly 1 value, zero, // in the offsets buffer. While some arrow implementations may accept a zero-sized // offsets buffer, best practices would be to allocate the buffer with the single value. 
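// Layout reminder behind the zero-length special case here (general Arrow
// convention, not cudf-specific): a STRING array stores int32 offsets and a
// LARGE_STRING array stores int64 offsets; N rows need N + 1 offsets, so even
// an empty array carries the single offset 0. For example:
#include <cstdint>
#include <string>
#include <vector>

int main()
{
  // ["ab", "c"] in Arrow string layout: offsets plus a flat character buffer.
  std::vector<int32_t> offsets{0, 2, 3};
  std::string chars{"abc"};
  // Row i spans chars[offsets[i], offsets[i + 1]).
  auto const row1 = chars.substr(offsets[1], offsets[2] - offsets[1]);  // "c"
  // A zero-row array still carries one offset: the single zero allocated below,
  // sized as int64 when the large-string branch is taken.
  std::vector<int32_t> empty_offsets{0};
  return (row1 == "c" && empty_offsets.size() == 1) ? 0 : 1;
}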
- auto zero = std::make_unique>(0, stream, mr); - NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + if (nanoarrow_type == NANOARROW_TYPE_STRING) { + auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + } else { + auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + } + ArrowArrayMove(tmp.get(), out); return NANOARROW_OK; } @@ -436,7 +409,7 @@ template <> int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const { using DeviceType = int32_t; - NANOARROW_RETURN_NOT_OK(decimals_to_arrow(column, stream, mr, out)); + NANOARROW_RETURN_NOT_OK(construct_decimals(column, stream, mr, out)); NANOARROW_RETURN_NOT_OK(set_null_mask(column, out)); return NANOARROW_OK; } @@ -445,7 +418,7 @@ template <> int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const { using DeviceType = int64_t; - NANOARROW_RETURN_NOT_OK(decimals_to_arrow(column, stream, mr, out)); + NANOARROW_RETURN_NOT_OK(construct_decimals(column, stream, mr, out)); NANOARROW_RETURN_NOT_OK(set_null_mask(column, out)); return NANOARROW_OK; } @@ -481,13 +454,26 @@ int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const template <> int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const { + ArrowType nanoarrow_type = NANOARROW_TYPE_STRING; + if (column.num_children() > 0 && + column.child(cudf::strings_column_view::offsets_column_index).type().id() == + cudf::type_id::INT64) { + nanoarrow_type = NANOARROW_TYPE_LARGE_STRING; + } + nanoarrow::UniqueArray tmp; - NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRING, column)); + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), nanoarrow_type, column)); if (column.size() == 0) { // https://github.com/rapidsai/cudf/pull/15047#discussion_r1546528552 - auto zero = std::make_unique>(0, stream, mr); - NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) { + auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + } else { + auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + } + ArrowArrayMove(tmp.get(), out); return NANOARROW_OK; } diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu new file mode 100644 index 00000000000..193b3a3b5a2 --- /dev/null +++ b/cpp/src/interop/to_arrow_host.cu @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arrow_utilities.hpp" +#include "decimal_conversion_utilities.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace detail { + +namespace { + +struct dispatch_to_arrow_host { + cudf::column_view column; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + int populate_validity_bitmap(ArrowBitmap* bitmap) const + { + if (!column.has_nulls()) { return NANOARROW_OK; } + + NANOARROW_RETURN_NOT_OK(ArrowBitmapResize(bitmap, static_cast(column.size()), 0)); + CUDF_CUDA_TRY(cudaMemcpyAsync(bitmap->buffer.data, + (column.offset() > 0) + ? cudf::detail::copy_bitmask(column, stream, mr).data() + : column.null_mask(), + bitmap->buffer.size_bytes, + cudaMemcpyDefault, + stream.value())); + return NANOARROW_OK; + } + + template + int populate_data_buffer(device_span input, ArrowBuffer* buffer) const + { + NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, input.size_bytes(), 1)); + CUDF_CUDA_TRY(cudaMemcpyAsync( + buffer->data, input.data(), input.size_bytes(), cudaMemcpyDefault, stream.value())); + return NANOARROW_OK; + } + + template () && !cudf::is_fixed_point())> + int operator()(ArrowArray*) const + { + CUDF_FAIL("Unsupported type for to_arrow_host", cudf::data_type_error); + } + + template () || std::is_same_v)> + int operator()(ArrowArray* out) const + { + nanoarrow::UniqueArray tmp; + + auto const storage_type = id_to_arrow_storage_type(column.type().id()); + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), storage_type, column)); + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + using DataType = std::conditional_t, __int128_t, T>; + NANOARROW_RETURN_NOT_OK( + populate_data_buffer(device_span(column.data(), column.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } + + // convert decimal types from libcudf to arrow where those types are not directly + // supported by Arrow. These types must be fit into 128 bits, the smallest + // decimal resolution supported by Arrow + template () && + (std::is_same_v || + std::is_same_v))> + int operator()(ArrowArray* out) const + { + using DeviceType = std::conditional_t, int32_t, int64_t>; + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column)); + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + auto buf = detail::convert_decimals_to_decimal128(column, stream, mr); + // No need to synchronize stream here as populate_data_buffer uses the same stream to copy data + // to host. 
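// The synchronization comments in these interop paths follow CUDA stream
// ordering: work enqueued on one stream executes in order, so a copy issued on
// the same stream as the producing kernel needs no explicit sync, and the host
// may only read the copied bytes once the stream has drained. A schematic
// sketch (copy_then_read is illustrative, not a cudf helper):
#include <cstddef>
#include <cuda_runtime_api.h>

void copy_then_read(void* host_buf, void const* dev_buf, std::size_t n, cudaStream_t stream)
{
  // Ordered after any kernel previously enqueued on `stream`, which is why the
  // decimal path above can skip an explicit synchronize before this copy.
  cudaMemcpyAsync(host_buf, dev_buf, n, cudaMemcpyDefault, stream);
  // Required before the host dereferences host_buf; to_arrow_host performs this
  // once at the end, after all columns have been enqueued.
  cudaStreamSynchronize(stream);
}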
+ NANOARROW_RETURN_NOT_OK( + populate_data_buffer(device_span<__int128_t const>( + reinterpret_cast(buf->data()), column.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } +}; + +int get_column(cudf::column_view column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr, + ArrowArray* out); + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + auto bitmask = bools_to_mask(column, stream, mr); + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(reinterpret_cast(bitmask.first->data()), + bitmask.first->size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + ArrowType nanoarrow_type = NANOARROW_TYPE_STRING; + if (column.num_children() > 0 && + column.child(cudf::strings_column_view::offsets_column_index).type().id() == + cudf::type_id::INT64) { + nanoarrow_type = NANOARROW_TYPE_LARGE_STRING; + } + + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), nanoarrow_type, column)); + + if (column.size() == 0) { + // initialize the offset buffer with a single zero by convention + if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt64(ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx), 0)); + } else { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx), 0)); + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + + auto const scv = cudf::strings_column_view(column); + auto const offsets = scv.offsets(); + if (offsets.type().id() == cudf::type_id::INT64) { + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(offsets.data() + scv.offset(), scv.size() + 1), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + } else { + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(offsets.data() + scv.offset(), scv.size() + 1), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + } + + NANOARROW_RETURN_NOT_OK( + populate_data_buffer(device_span(scv.chars_begin(stream), scv.chars_size(stream)), + ArrowArrayBuffer(tmp.get(), 2))); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_LIST, column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), 1)); + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + auto const lcv = cudf::lists_column_view(column); + + if (column.size() == 0) { + // initialize the offsets buffer with a single zero by convention for 0 length + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx), 0)); + } else { + NANOARROW_RETURN_NOT_OK( + populate_data_buffer(device_span(lcv.offsets_begin(), (column.size() + 1)), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + } + + 
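// The list path mirrors the string path: an int32 offsets buffer over a
// recursively converted child array. A worked example of the layout (plain
// host code, illustrative):
#include <cstdint>
#include <vector>

int main()
{
  // [[1, 2], [], [3]] in Arrow list layout:
  std::vector<int32_t> offsets{0, 2, 2, 3};  // N + 1 entries; the empty list spans [2, 2)
  std::vector<int32_t> child{1, 2, 3};       // flattened values, converted recursively below
  // List i spans child[offsets[i], offsets[i + 1]); an empty parent column gets
  // the single offset 0, the same convention strings use.
  return (offsets.size() == 4 && child.size() == 3) ? 0 : 1;
}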
NANOARROW_RETURN_NOT_OK(get_column(lcv.child(), stream, mr, tmp->children[0])); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array( + tmp.get(), + id_to_arrow_type(column.child(cudf::dictionary_column_view::indices_column_index).type().id()), + column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateDictionary(tmp.get())); + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + auto dcv = cudf::dictionary_column_view(column); + auto dict_indices = dcv.get_indices_annotated(); + switch (dict_indices.type().id()) { + case type_id::INT8: + case type_id::UINT8: + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(dict_indices.data(), dict_indices.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + break; + case type_id::INT16: + case type_id::UINT16: + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(dict_indices.data(), dict_indices.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + break; + case type_id::INT32: + case type_id::UINT32: + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(dict_indices.data(), dict_indices.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + break; + case type_id::INT64: + case type_id::UINT64: + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(dict_indices.data(), dict_indices.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + break; + default: CUDF_FAIL("unsupported type for dictionary indices"); + } + + NANOARROW_RETURN_NOT_OK(get_column(dcv.keys(), stream, mr, tmp->dictionary)); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + nanoarrow::UniqueArray tmp; + + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRUCT, column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), column.num_children())); + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + + auto const scv = cudf::structs_column_view(column); + + for (size_t i = 0; i < size_t(tmp->n_children); ++i) { + ArrowArray* child_ptr = tmp->children[i]; + auto const child = scv.get_sliced_child(i, stream); + NANOARROW_RETURN_NOT_OK(get_column(child, stream, mr, child_ptr)); + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +int get_column(cudf::column_view column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr, + ArrowArray* out) +{ + return column.type().id() != type_id::EMPTY + ? 
type_dispatcher(column.type(), dispatch_to_arrow_host{column, stream, mr}, out) + : initialize_array(out, NANOARROW_TYPE_NA, column); +} + +unique_device_array_t create_device_array(nanoarrow::UniqueArray&& out) +{ + ArrowError err; + if (ArrowArrayFinishBuildingDefault(out.get(), &err) != NANOARROW_OK) { + std::cerr << err.message << std::endl; + CUDF_FAIL("failed to build"); + } + + unique_device_array_t result(new ArrowDeviceArray, [](ArrowDeviceArray* arr) { + if (arr->array.release != nullptr) { ArrowArrayRelease(&arr->array); } + delete arr; + }); + + result->device_id = -1; + result->device_type = ARROW_DEVICE_CPU; + result->sync_event = nullptr; + ArrowArrayMove(out.get(), &result->array); + return result; +} + +} // namespace + +unique_device_array_t to_arrow_host(cudf::table_view const& table, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRUCT)); + + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), table.num_columns())); + tmp->length = table.num_rows(); + tmp->null_count = 0; + + for (cudf::size_type i = 0; i < table.num_columns(); ++i) { + auto child = tmp->children[i]; + auto col = table.column(i); + NANOARROW_THROW_NOT_OK( + cudf::type_dispatcher(col.type(), detail::dispatch_to_arrow_host{col, stream, mr}, child)); + } + + // wait for all the stream operations to complete before we return. + // this ensures that the host memory that we're returning will be populated + // before we return from this function. + stream.synchronize(); + + return create_device_array(std::move(tmp)); +} + +unique_device_array_t to_arrow_host(cudf::column_view const& col, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + nanoarrow::UniqueArray tmp; + + NANOARROW_THROW_NOT_OK( + cudf::type_dispatcher(col.type(), detail::dispatch_to_arrow_host{col, stream, mr}, tmp.get())); + + // wait for all the stream operations to complete before we return. + // this ensures that the host memory that we're returning will be populated + // before we return from this function. + stream.synchronize(); + + return create_device_array(std::move(tmp)); +} + +} // namespace detail + +unique_device_array_t to_arrow_host(cudf::column_view const& col, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::to_arrow_host(col, stream, mr); +} + +unique_device_array_t to_arrow_host(cudf::table_view const& table, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::to_arrow_host(table, stream, mr); +} + +} // namespace cudf diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp index 19915464236..b98ca8a7bed 100644 --- a/cpp/src/interop/to_arrow_schema.cpp +++ b/cpp/src/interop/to_arrow_schema.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -120,7 +119,11 @@ int dispatch_to_arrow_type::operator()(column_view input, column_metadata const&, ArrowSchema* out) { - return ArrowSchemaSetType(out, NANOARROW_TYPE_STRING); + return ((input.num_children() == 0 || + input.child(cudf::strings_column_view::offsets_column_index).type().id() == + type_id::INT32)) + ? 
ArrowSchemaSetType(out, NANOARROW_TYPE_STRING) + : ArrowSchemaSetType(out, NANOARROW_TYPE_LARGE_STRING); } // these forward declarations are needed due to the recursive calls to them diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 814efe2b5a1..69a0e982a5b 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -554,9 +554,11 @@ table_with_metadata read_avro(std::unique_ptr&& source, auto d_global_dict_data = rmm::device_uvector(0, stream); if (total_dictionary_entries > 0) { - auto h_global_dict = std::vector(total_dictionary_entries); - auto h_global_dict_data = std::vector(dictionary_data_size); - size_t dict_pos = 0; + auto h_global_dict = + cudf::detail::make_host_vector(total_dictionary_entries, stream); + auto h_global_dict_data = + cudf::detail::make_host_vector(dictionary_data_size, stream); + size_t dict_pos = 0; for (size_t i = 0; i < column_types.size(); ++i) { auto const col_idx = selected_columns[i].first; diff --git a/cpp/src/io/comp/gpuinflate.hpp b/cpp/src/io/comp/gpuinflate.hpp index 5908b77c98b..8bfca2b30df 100644 --- a/cpp/src/io/comp/gpuinflate.hpp +++ b/cpp/src/io/comp/gpuinflate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -73,6 +74,7 @@ constexpr std::size_t BUFFER_PADDING_MULTIPLE{8}; * @param[in] parse_hdr Whether or not to parse GZIP header * @param[in] stream CUDA stream to use */ +CUDF_EXPORT void gpuinflate(device_span const> inputs, device_span const> outputs, device_span results, @@ -101,6 +103,7 @@ void gpu_copy_uncompressed_blocks(device_span const> * @param[out] results List of output status structures * @param[in] stream CUDA stream to use */ +CUDF_EXPORT void gpu_unsnap(device_span const> inputs, device_span const> outputs, device_span results, @@ -113,6 +116,7 @@ void gpu_unsnap(device_span const> inputs, * * @return The size in bytes of required temporary memory */ +CUDF_EXPORT size_t get_gpu_debrotli_scratch_size(int max_num_inputs = 0); /** @@ -128,6 +132,7 @@ size_t get_gpu_debrotli_scratch_size(int max_num_inputs = 0); * @param[in] scratch_size Size in bytes of the temporary memory * @param[in] stream CUDA stream to use */ +CUDF_EXPORT void gpu_debrotli(device_span const> inputs, device_span const> outputs, device_span results, diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 05faded651d..40d4372ae9d 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -567,7 +567,7 @@ void infer_column_types(parse_options const& parse_opts, } std::vector decode_data(parse_options const& parse_opts, - std::vector const& column_flags, + host_span column_flags, std::vector const& column_names, device_span data, device_span row_offsets, @@ -592,8 +592,8 @@ std::vector decode_data(parse_options const& parse_opts, } } - thrust::host_vector h_data(num_active_columns); - thrust::host_vector h_valid(num_active_columns); + auto h_data = cudf::detail::make_host_vector(num_active_columns, stream); + auto h_valid = cudf::detail::make_host_vector(num_active_columns, stream); for (int i = 0; i < num_active_columns; ++i) { h_data[i] = out_buffers[i].data(); @@ -622,14 +622,16 @@ std::vector decode_data(parse_options const& parse_opts, return out_buffers; } -std::vector 
determine_column_types(csv_reader_options const& reader_opts, - parse_options const& parse_opts, - host_span column_names, - device_span data, - device_span row_offsets, - int32_t num_records, - host_span column_flags, - rmm::cuda_stream_view stream) +cudf::detail::host_vector determine_column_types( + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + host_span column_names, + device_span data, + device_span row_offsets, + int32_t num_records, + host_span column_flags, + cudf::size_type num_active_columns, + rmm::cuda_stream_view stream) { std::vector column_types(column_flags.size()); @@ -653,7 +655,8 @@ std::vector determine_column_types(csv_reader_options const& reader_o stream); // compact column_types to only include active columns - std::vector active_col_types; + auto active_col_types = + cudf::detail::make_empty_host_vector(num_active_columns, stream); std::copy_if(column_types.cbegin(), column_types.cend(), std::back_inserter(active_col_types), @@ -697,8 +700,10 @@ table_with_metadata read_csv(cudf::io::datasource* source, auto const num_actual_columns = static_cast(column_names.size()); auto num_active_columns = num_actual_columns; - auto column_flags = std::vector( - num_actual_columns, column_parse::enabled | column_parse::inferred); + auto column_flags = + cudf::detail::make_host_vector(num_actual_columns, stream); + std::fill( + column_flags.begin(), column_flags.end(), column_parse::enabled | column_parse::inferred); // User did not pass column names to override names in the file // Process names from the file to remove empty and duplicated strings @@ -842,8 +847,15 @@ table_with_metadata read_csv(cudf::io::datasource* source, // Exclude the end-of-data row from number of rows with actual data auto const num_records = std::max(row_offsets.size(), 1ul) - 1; - auto const column_types = determine_column_types( - reader_opts, parse_opts, column_names, data, row_offsets, num_records, column_flags, stream); + auto const column_types = determine_column_types(reader_opts, + parse_opts, + column_names, + data, + row_offsets, + num_records, + column_flags, + num_active_columns, + stream); auto metadata = table_metadata{}; auto out_columns = std::vector>(); diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 2171764decd..0e70984b39c 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -18,7 +18,9 @@ #include "in_reg_array.cuh" #include +#include #include +#include #include namespace cudf::io::fst::detail { @@ -44,9 +46,10 @@ using StateIndexT = uint32_t; template struct VectorCompositeOp { template - __host__ __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs) + __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs) { VectorT res{}; +#pragma unroll for (int32_t i = 0; i < NUM_ITEMS; ++i) { res.Set(i, rhs.Get(lhs.Get(i))); } @@ -57,61 +60,275 @@ struct VectorCompositeOp { /** * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed - * to emit any output symbol (the "transduced" output) and, if so, keeps track of how many symbols - * it intends to write out and writing out such symbols to the given output iterators. + * to emit any output symbol (the "transduced" output) and, if so, keeps track of *how many* symbols + * it intends to write out. 
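// Illustrative sketch (not part of this patch): the count-only ReadSymbol callback described
// above supports a two-pass transduction strategy: pass 1 only counts output symbols, an
// exclusive scan turns the counts into write offsets, and pass 2 performs the actual writes.
// A standalone host-side analogue with a hypothetical toy transducer `emit_count`, assuming C++17:
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

std::vector<int> transduce_two_pass(std::vector<int> const& in)
{
  auto emit_count = [](int sym) { return sym % 2 == 0 ? 1 : 0; };  // toy transducer
  // Pass 1: count the outputs produced per input symbol
  std::vector<int> counts(in.size());
  std::transform(in.begin(), in.end(), counts.begin(), emit_count);
  // Exclusive scan converts counts into write offsets
  std::vector<int> offsets(in.size());
  std::exclusive_scan(counts.begin(), counts.end(), offsets.begin(), 0);
  // Pass 2: every producer writes at its precomputed offset, leaving no gaps
  std::vector<int> out(in.empty() ? 0 : offsets.back() + counts.back());
  for (std::size_t i = 0; i < in.size(); ++i) {
    if (counts[i] != 0) { out[offsets[i]] = in[i]; }
  }
  return out;
}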
+ */ +template +class DFACountCallbackWrapper { + public: + __device__ __forceinline__ DFACountCallbackWrapper(TransducerTableT transducer_table) + : transducer_table(transducer_table) + { + } + + template + __device__ __forceinline__ void Init(OffsetT const&) + { + out_count = 0; + } + + template + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) + { + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + out_count += count; + } + + __device__ __forceinline__ void TearDown() {} + TransducerTableT const transducer_table; + uint32_t out_count{}; +}; + +/** + * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the + * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed + * to emit any output symbol (the "transduced" output) and, if so, writes out such symbols to the + * given output iterators. + * + * @tparam MaxTranslatedOutChars The maximum number of symbols that are written on any given state + * transition + * @tparam TransducerTableT The type implementing a transducer table that can be used for looking up + * the symbols that are supposed to be emitted on a given state transition. + * @tparam TransducedOutItT A random-access output iterator type to which symbols returned by the + * transducer table are assignable. + * @tparam TransducedIndexOutItT A random-access output iterator type to which indexes are written.
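// Illustrative sketch (not part of this patch): the MaxTranslatedOutChars dispatch in the class
// below selects, at compile time, a fully unrolled fixed-trip loop when at most two symbols can
// be emitted per transition. A simplified analogue using `if constexpr` instead of cub::Int2Type
// tag dispatch; the function and parameter names are hypothetical:
template <int MaxOut>
__device__ void emit_translated(char* dst, int pos, char const* symbols, int count)
{
  if constexpr (MaxOut <= 2) {
    // Fixed trip count lets the compiler unroll; the bound check merely
    // predicates the store instead of terminating the loop.
#pragma unroll
    for (int i = 0; i < MaxOut; ++i) {
      if (i < count) { dst[pos + i] = symbols[i]; }
    }
  } else {
    // General path: dynamic trip count, no unrolling guarantee
    for (int i = 0; i < count; ++i) { dst[pos + i] = symbols[i]; }
  }
}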
*/ -template -class DFASimulationCallbackWrapper { +template +class DFAWriteCallbackWrapper { public: - __host__ __device__ __forceinline__ DFASimulationCallbackWrapper( - TransducerTableT transducer_table, TransducedOutItT out_it, TransducedIndexOutItT out_idx_it) - : transducer_table(transducer_table), out_it(out_it), out_idx_it(out_idx_it), write(false) + __device__ __forceinline__ DFAWriteCallbackWrapper(TransducerTableT transducer_table, + TransducedOutItT out_it, + TransducedIndexOutItT out_idx_it, + uint32_t out_offset, + uint32_t /*tile_out_offset*/, + uint32_t /*tile_in_offset*/, + uint32_t /*tile_out_count*/) + : transducer_table(transducer_table), + out_it(out_it), + out_idx_it(out_idx_it), + out_offset(out_offset) { } template - __host__ __device__ __forceinline__ void Init(OffsetT const& offset) + __device__ __forceinline__ void Init(OffsetT const& in_offset) + { + this->in_offset = in_offset; + } + + template + __device__ __forceinline__ + typename ::cuda::std::enable_if<(MaxTranslatedOutChars_ <= 2), void>::type + ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol, + cub::Int2Type /*MaxTranslatedOutChars*/) + { + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + +#pragma unroll + for (uint32_t out_char = 0; out_char < MaxTranslatedOutChars_; out_char++) { + if (out_char < count) { + out_it[out_offset + out_char] = + transducer_table(old_state, symbol_id, out_char, read_symbol); + out_idx_it[out_offset + out_char] = in_offset + character_index; + } + } + out_offset += count; + } + + template + __device__ __forceinline__ + typename ::cuda::std::enable_if<(MaxTranslatedOutChars_ > 2), void>::type + ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol, + cub::Int2Type) { - this->offset = offset; - if (!write) out_count = 0; + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + + for (uint32_t out_char = 0; out_char < count; out_char++) { + out_it[out_offset + out_char] = transducer_table(old_state, symbol_id, out_char, read_symbol); + out_idx_it[out_offset + out_char] = in_offset + character_index; + } + out_offset += count; } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, - StateIndexT const old_state, - StateIndexT const new_state, - SymbolIndexT const symbol_id, - SymbolT const read_symbol) + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) + { + ReadSymbol(character_index, + old_state, + new_state, + symbol_id, + read_symbol, + cub::Int2Type{}); + } + + __device__ __forceinline__ void TearDown() {} + + public: + TransducerTableT const transducer_table; + TransducedOutItT out_it; + TransducedIndexOutItT out_idx_it; + uint32_t out_offset; + uint32_t in_offset; +}; + +/** + * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the + * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed + * to emit any output symbol (the "transduced" output) and, if so, writes out such symbols to the + * given output iterators. This class uses a shared memory-backed write buffer to coalesce writes to + * global memory. 
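// Illustrative sketch (not part of this patch): the write-coalescing idea described above,
// reduced to a stripped-down kernel. Each thread stages its scattered outputs at tile-relative
// offsets in shared memory; after a barrier the whole block drains the buffer so consecutive
// threads hit consecutive global addresses. Assumes one tile per block, per-thread offset/count
// arrays sized to the block, and a tile output no larger than the staging buffer; all names here
// are hypothetical:
__global__ void coalesced_write(char const* per_thread_payload, int const* thread_offset,
                                int const* thread_count, int tile_out_count, char* out)
{
  __shared__ char tile_buf[4096];  // per-tile staging buffer (size illustrative)
  int const t = threadIdx.x;
  // Phase 1: scattered writes land in fast shared memory
  for (int i = 0; i < thread_count[t]; ++i) {
    tile_buf[thread_offset[t] + i] = per_thread_payload[t];
  }
  __syncthreads();
  // Phase 2: coalesced flush to global memory
  for (int i = t; i < tile_out_count; i += blockDim.x) {
    out[blockIdx.x * 4096 + i] = tile_buf[i];
  }
}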
+ * + * @tparam DiscardIndexOutput Whether to discard the indexes instead of writing them to the given + * output iterator + * @tparam DiscardTranslatedOutput Whether to discard the translated output symbols instead of + * writing them to the given output iterator + * @tparam NumWriteBufferItems The number of items to allocate in shared memory for the write + * buffer. + * @tparam OutputT The type of the translated items + * @tparam TransducerTableT The type implementing a transducer table that can be used for looking up + * the symbols that are supposed to be emitted on a given state transition. + * @tparam TransducedOutItT A random-access output iterator type to which symbols returned by the + * transducer table are assignable. + * @tparam TransducedIndexOutItT A random-access output iterator type to which indexes are written. + */ +template +class WriteCoalescingCallbackWrapper { + struct TempStorage_Offsets { + uint16_t compacted_offset[NumWriteBufferItems]; + }; + struct TempStorage_Symbols { + OutputT compacted_symbols[NumWriteBufferItems]; + }; + using offset_cache_t = + ::cuda::std::conditional_t; + using symbol_cache_t = ::cuda::std:: + conditional_t, TempStorage_Symbols>; + struct TempStorage_ : offset_cache_t, symbol_cache_t {}; + + __device__ __forceinline__ TempStorage_& PrivateStorage() + { + __shared__ TempStorage private_storage; + return private_storage.Alias(); + } + TempStorage_& temp_storage; + + public: + struct TempStorage : cub::Uninitialized {}; + + __device__ __forceinline__ WriteCoalescingCallbackWrapper(TransducerTableT transducer_table, + TransducedOutItT out_it, + TransducedIndexOutItT out_idx_it, + uint32_t thread_out_offset, + uint32_t tile_out_offset, + uint32_t tile_in_offset, + uint32_t tile_out_count) + : temp_storage(PrivateStorage()), + transducer_table(transducer_table), + out_it(out_it), + out_idx_it(out_idx_it), + thread_out_offset(thread_out_offset), + tile_out_offset(tile_out_offset), + tile_in_offset(tile_in_offset), + tile_out_count(tile_out_count) + { + } + + template + __device__ __forceinline__ void Init(OffsetT const& offset) + { + this->in_offset = offset; + } + + template + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); - if (write) { -#if defined(__CUDA_ARCH__) -#pragma unroll 1 -#endif - for (uint32_t out_char = 0; out_char < count; out_char++) { - out_it[out_count + out_char] = + for (uint32_t out_char = 0; out_char < count; out_char++) { + if constexpr (!DiscardIndexOutput) { + temp_storage.compacted_offset[thread_out_offset + out_char - tile_out_offset] = + in_offset + character_index - tile_in_offset; + } + if constexpr (!DiscardTranslatedOutput) { + temp_storage.compacted_symbols[thread_out_offset + out_char - tile_out_offset] = transducer_table(old_state, symbol_id, out_char, read_symbol); - out_idx_it[out_count + out_char] = offset + character_index; } } - out_count += count; + thread_out_offset += count; } - __host__ __device__ __forceinline__ void TearDown() {} + __device__ __forceinline__ void TearDown() + { + __syncthreads(); + if constexpr (!DiscardTranslatedOutput) { + for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { + out_it[tile_out_offset + out_char] = temp_storage.compacted_symbols[out_char]; + } + } + if constexpr (!DiscardIndexOutput) { + for (uint32_t 
out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { + out_idx_it[tile_out_offset + out_char] = + temp_storage.compacted_offset[out_char] + tile_in_offset; + } + } + __syncthreads(); + } public: TransducerTableT const transducer_table; TransducedOutItT out_it; TransducedIndexOutItT out_idx_it; - uint32_t out_count; - uint32_t offset; - bool write; + uint32_t thread_out_offset; + uint32_t tile_out_offset; + uint32_t tile_in_offset; + uint32_t in_offset; + uint32_t tile_out_count; }; /** @@ -125,17 +342,18 @@ class DFASimulationCallbackWrapper { template class StateVectorTransitionOp { public: - __host__ __device__ __forceinline__ StateVectorTransitionOp( + __device__ __forceinline__ StateVectorTransitionOp( TransitionTableT const& transition_table, std::array& state_vector) : transition_table(transition_table), state_vector(state_vector) { } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const& read_symbol_id, - SymbolT const& read_symbol) const + __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) const { +#pragma unroll for (int32_t i = 0; i < NUM_INSTANCES; ++i) { state_vector[i] = transition_table(state_vector[i], read_symbol_id); } @@ -152,17 +370,17 @@ struct StateTransitionOp { TransitionTableT const& transition_table; CallbackOpT& callback_op; - __host__ __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table, - StateIndexT state, - CallbackOpT& callback_op) + __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table, + StateIndexT state, + CallbackOpT& callback_op) : transition_table(transition_table), state(state), callback_op(callback_op) { } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const& read_symbol_id, - SymbolT const& read_symbol) + __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) { // Remember what state we were in before we made the transition StateIndexT previous_state = state; @@ -420,7 +638,7 @@ struct AgentDFA { __syncthreads(); // Thread's symbols - CharT* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; + CharT const* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; // Parse thread's symbols and transition the state-vector if (is_full_block) { @@ -538,6 +756,43 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL // The state transition vector passed on to the second stage of the algorithm StateVectorT out_state_vector; + using OutSymbolT = typename DfaT::OutSymbolT; + // static constexpr int32_t MIN_TRANSLATED_OUT = DfaT::MIN_TRANSLATED_OUT; + static constexpr int32_t num_max_translated_out = DfaT::MAX_TRANSLATED_OUT; + static constexpr bool discard_out_index = + ::cuda::std::is_same>::value; + static constexpr bool discard_out_it = + ::cuda::std::is_same>::value; + using NonWriteCoalescingT = + DFAWriteCallbackWrapper; + + using WriteCoalescingT = + WriteCoalescingCallbackWrapper; + + static constexpr bool is_translation_pass = (!IS_TRANS_VECTOR_PASS) || IS_SINGLE_PASS; + + // Use write-coalescing only if the worst-case output size per tile fits into shared memory + static constexpr bool can_use_smem_cache = + (sizeof(typename WriteCoalescingT::TempStorage) + sizeof(typename AgentDfaSimT::TempStorage) + + 
sizeof(typename DfaT::SymbolGroupStorageT) + sizeof(typename DfaT::TransitionTableStorageT) + + sizeof(typename DfaT::TranslationTableStorageT)) < (48 * 1024); + static constexpr bool use_smem_cache = + is_translation_pass and + (sizeof(typename WriteCoalescingT::TempStorage) <= AgentDFAPolicy::SMEM_THRESHOLD) and + can_use_smem_cache; + + using DFASimulationCallbackWrapperT = + cuda::std::conditional_t; + // Stage 1: Compute the state-transition vector if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) { // Keeping track of the state for each of the state machines @@ -576,7 +831,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL // -> first block/tile: write out block aggregate as the "tile's" inclusive (i.e., the one that // incorporates all preceding blocks/tiles results) //------------------------------------------------------------------------------ - if (IS_SINGLE_PASS) { + if constexpr (IS_SINGLE_PASS) { uint32_t tile_idx = blockIdx.x; using StateVectorCompositeOpT = VectorCompositeOp; @@ -623,10 +878,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL } // Perform finite-state machine simulation, computing size of transduced output - DFASimulationCallbackWrapper - callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it); + DFACountCallbackWrapper count_chars_callback_op{transducer_table}; StateIndexT t_start_state = state; agent_dfa.GetThreadStateTransitions(symbol_matcher, @@ -635,7 +887,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL blockIdx.x * SYMBOLS_PER_BLOCK, num_chars, state, - callback_wrapper, + count_chars_callback_op, cub::Int2Type()); __syncthreads(); @@ -650,15 +902,18 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL __shared__ typename OffsetPrefixScanCallbackOpT_::TempStorage prefix_callback_temp_storage; uint32_t tile_idx = blockIdx.x; + uint32_t tile_out_offset{}; + uint32_t tile_out_count{}; + uint32_t thread_out_offset{}; if (tile_idx == 0) { OffsetT block_aggregate = 0; OutOffsetBlockScan(scan_temp_storage) - .ExclusiveScan(callback_wrapper.out_count, - callback_wrapper.out_count, + .ExclusiveScan(count_chars_callback_op.out_count, + thread_out_offset, static_cast(0), cub::Sum{}, block_aggregate); - + tile_out_count = block_aggregate; if (threadIdx.x == 0 /*and not IS_LAST_TILE*/) { offset_tile_state.SetInclusive(0, block_aggregate); } @@ -671,22 +926,28 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL offset_tile_state, prefix_callback_temp_storage, cub::Sum{}, tile_idx); OutOffsetBlockScan(scan_temp_storage) - .ExclusiveScan( - callback_wrapper.out_count, callback_wrapper.out_count, cub::Sum{}, prefix_op); - + .ExclusiveScan(count_chars_callback_op.out_count, thread_out_offset, cub::Sum{}, prefix_op); + tile_out_offset = prefix_op.GetExclusivePrefix(); + tile_out_count = prefix_op.GetBlockAggregate(); if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) { *d_num_transduced_out_it = prefix_op.GetInclusivePrefix(); } } - callback_wrapper.write = true; + DFASimulationCallbackWrapperT write_translated_callback_op{transducer_table, + transduced_out_it, + transduced_out_idx_it, + thread_out_offset, + tile_out_offset, + blockIdx.x * SYMBOLS_PER_BLOCK, + tile_out_count}; agent_dfa.GetThreadStateTransitions(symbol_matcher, transition_table, d_chars, blockIdx.x * SYMBOLS_PER_BLOCK, num_chars, t_start_state, - callback_wrapper, + write_translated_callback_op, cub::Int2Type()); } } diff --git a/cpp/src/io/fst/dispatch_dfa.cuh 
b/cpp/src/io/fst/dispatch_dfa.cuh index be63ec6539f..ef5e9c8a78f 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -37,6 +37,11 @@ struct AgentDFAPolicy { // The number of symbols processed by each thread static constexpr int32_t ITEMS_PER_THREAD = _ITEMS_PER_THREAD; + + // If the shared memory-backed write buffer exceeds this threshold, the FST will skip buffering + // the output in a write buffer and instead immediately write out to global memory, potentially + // resulting in non-coalesced writes + static constexpr std::size_t SMEM_THRESHOLD = 24 * 1024; }; /** @@ -49,7 +54,7 @@ struct DeviceFSMPolicy { struct Policy900 : cub::ChainedPolicy<900, Policy900, Policy900> { enum { BLOCK_THREADS = 128, - ITEMS_PER_THREAD = 32, + ITEMS_PER_THREAD = 16, }; using AgentDFAPolicy = AgentDFAPolicy; diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 5532a7f994b..ae1f81fd541 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -367,18 +367,18 @@ class TransitionTable { template static KernelParameter InitDeviceTransitionTable( - std::array, MAX_NUM_STATES> const& translation_table) + std::array, MAX_NUM_STATES> const& transition_table) { KernelParameter init_data{}; - // translation_table[state][symbol] -> new state - for (std::size_t state = 0; state < translation_table.size(); ++state) { - for (std::size_t symbol = 0; symbol < translation_table[state].size(); ++symbol) { + // transition_table[state][symbol] -> new state + for (std::size_t state = 0; state < transition_table.size(); ++state) { + for (std::size_t symbol = 0; symbol < transition_table[state].size(); ++symbol) { CUDF_EXPECTS( - static_cast(translation_table[state][symbol]) <= + static_cast(transition_table[state][symbol]) <= std::numeric_limits::max(), "Target state index value exceeds value representable by the transition table's type"); init_data.transitions[symbol * MAX_NUM_STATES + state] = - static_cast(translation_table[state][symbol]); + static_cast(transition_table[state][symbol]); } } @@ -494,6 +494,10 @@ class dfa_device_view { // This is a value queried by the DFA simulation algorithm static constexpr int32_t MAX_NUM_STATES = NUM_STATES; + using OutSymbolT = typename TranslationTableT::OutSymbolT; + static constexpr int32_t MIN_TRANSLATED_OUT = TranslationTableT::MIN_TRANSLATED_OUT; + static constexpr int32_t MAX_TRANSLATED_OUT = TranslationTableT::MAX_TRANSLATED_OUT; + using SymbolGroupStorageT = std::conditional_t::value, typename SymbolGroupIdLookupT::TempStorage, typename cub::NullType>; @@ -542,24 +546,33 @@ class dfa_device_view { * @tparam OutSymbolT The symbol type being output * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output * symbols - * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_SYMBOLS The maximum number of symbol groups supported by this lookup table * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @tparam MIN_TRANSLATED_OUT_ The minimum number of symbols being output by a single state + * transition + * @tparam MAX_TRANSLATED_OUT_ The maximum number of symbols being output by a single state + * transition * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols - * be used. 
*/ -template class TransducerLookupTable { private: struct _TempStorage { OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; - OutSymbolT out_symbols[MAX_TABLE_SIZE]; + OutSymbolT_ out_symbols[MAX_TABLE_SIZE]; }; public: + using OutSymbolT = OutSymbolT_; + static constexpr int32_t MIN_TRANSLATED_OUT = MIN_TRANSLATED_OUT_; + static constexpr int32_t MAX_TRANSLATED_OUT = MAX_TRANSLATED_OUT_; + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { @@ -567,6 +580,8 @@ class TransducerLookupTable { OutSymbolOffsetT, MAX_NUM_SYMBOLS, MAX_NUM_STATES, + MIN_TRANSLATED_OUT, + MAX_TRANSLATED_OUT, MAX_TABLE_SIZE>; OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; @@ -686,14 +701,19 @@ class TransducerLookupTable { * sequence of symbols that the finite-state transducer is supposed to output for each transition. * * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols - * be used + * @tparam MIN_TRANSLATED_OUT The minimum number of symbols being output by a single state + * transition + * @tparam MAX_TRANSLATED_OUT The maximum number of symbols being output by a single state + * transition * @tparam OutSymbolT The symbol type being output - * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_SYMBOLS The maximum number of symbol groups supported by this lookup table * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support * @param translation_table The translation table * @return A translation table of type `TransducerLookupTable`. */ template @@ -705,20 +725,30 @@ auto make_translation_table(std::array, MAX_N OutSymbolOffsetT, MAX_NUM_SYMBOLS, MAX_NUM_STATES, + MIN_TRANSLATED_OUT, + MAX_TRANSLATED_OUT, MAX_TABLE_SIZE>; return translation_table_t::InitDeviceTranslationTable(translation_table); } -template +template class TranslationOp { private: struct _TempStorage {}; public: + using OutSymbolT = OutSymbolT_; + static constexpr int32_t MIN_TRANSLATED_OUT = MIN_TRANSLATED_OUT_; + static constexpr int32_t MAX_TRANSLATED_OUT = MAX_TRANSLATED_OUT_; + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { - using LookupTableT = TranslationOp; + using LookupTableT = + TranslationOp; TranslationOpT translation_op; }; @@ -772,6 +802,10 @@ class TranslationOp { * * @tparam FunctorT A function object type that must implement two signatures: (1) with `(state_id, * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)` + * @tparam MIN_TRANSLATED_SYMBOLS The minimum number of translated output symbols for any given + * input symbol + * @tparam MAX_TRANSLATED_SYMBOLS The maximum number of translated output symbols for any given + * input symbol * @param map_op A function object that must implement two signatures: (1) with `(state_id, * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)`. 
* Invocations of the first signature, (1), must return the number of symbols that are emitted for @@ -779,10 +813,14 @@ class TranslationOp { * that transition, where `i` corresponds to `relative_offset` * @return A translation table of type `TranslationOp` */ -template +template auto make_translation_functor(FunctorT map_op) { - return TranslationOp::InitDeviceTranslationTable(map_op); + return TranslationOp:: + InitDeviceTranslationTable(map_op); } /** diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 6d2834206d4..62c3c5cd245 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -41,6 +41,7 @@ #include namespace cudf::io { + // Returns builder for csv_reader_options csv_reader_options_builder csv_reader_options::builder(source_info src) { @@ -472,6 +473,8 @@ chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit, { } +chunked_orc_reader::chunked_orc_reader() = default; + // This destructor destroys the internal reader instance. // Since the declaration of the internal `reader` object does not exist in the header, this // destructor needs to be defined in a separate source file which can access that object's @@ -492,6 +495,10 @@ table_with_metadata chunked_orc_reader::read_chunk() const return reader->read_chunk(); } +orc_chunked_writer::orc_chunked_writer() = default; + +orc_chunked_writer::~orc_chunked_writer() = default; + /** * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer */ @@ -618,6 +625,8 @@ std::unique_ptr> write_parquet(parquet_writer_options const return writer->close(options.get_column_chunks_file_paths()); } +chunked_parquet_reader::chunked_parquet_reader() = default; + /** * @copydoc cudf::io::chunked_parquet_reader::chunked_parquet_reader */ @@ -672,6 +681,8 @@ table_with_metadata chunked_parquet_reader::read_chunk() const return reader->read_chunk(); } +parquet_chunked_writer::parquet_chunked_writer() = default; + /** * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer */ @@ -686,6 +697,8 @@ parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options co std::move(sinks), options, io_detail::single_write_mode::NO, stream); } +parquet_chunked_writer::~parquet_chunked_writer() = default; + /** * @copydoc cudf::io::parquet_chunked_writer::write */ diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 3e587768b11..17fa7abdffe 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -622,7 +622,7 @@ void make_device_json_column(device_span input, // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking std::map, NodeIndexT> mapped_columns; // find column_ids which are values, but should be ignored in validity - std::vector ignore_vals(num_columns, 0); + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); std::vector is_mixed_type_column(num_columns, 0); std::vector is_pruned(num_columns, 0); columns.try_emplace(parent_node_sentinel, std::ref(root)); @@ -812,7 +812,7 @@ void make_device_json_column(device_span input, return thrust::get<1>(a) < thrust::get<1>(b); }); // move columns data to device.
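// Illustrative sketch (not part of this patch): the std::vector -> cudf::detail::make_host_vector
// swaps in this and the surrounding hunks route host staging buffers through a host memory
// resource that can hand out page-locked (pinned) allocations. The payoff, shown here with plain
// CUDA runtime calls and hypothetical names, is that copies from pinned memory can run
// asynchronously, while pageable memory forces an extra internal staging copy:
#include <cstddef>
#include <cstring>
#include <cuda_runtime.h>

void stage_and_copy(char const* src, std::size_t n, char* d_dst, cudaStream_t stream)
{
  char* h_staging = nullptr;
  cudaMallocHost(&h_staging, n);  // page-locked host allocation
  std::memcpy(h_staging, src, n);
  // Truly asynchronous only because h_staging is pinned
  cudaMemcpyAsync(d_dst, h_staging, n, cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);  // the copy must finish before freeing
  cudaFreeHost(h_staging);
}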
- std::vector columns_data(num_columns); + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); for (auto& [col_id, col_ref] : columns) { if (col_id == parent_node_sentinel) continue; auto& col = col_ref.get(); diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index ca56a12eb36..760b2214365 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -302,11 +302,14 @@ void normalize_single_quotes(datasource::owning_buffer( + normalize_quotes::TransduceToNormalizedQuotes{}), + stream); rmm::device_uvector outbuf(indata.size() * 2, stream, mr); rmm::device_scalar outbuf_size(stream, mr); @@ -327,11 +330,14 @@ void normalize_whitespace(datasource::owning_buffer rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto parser = fst::detail::make_fst( - fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), - fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), - fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}), - stream); + static constexpr std::int32_t min_out = 0; + static constexpr std::int32_t max_out = 2; + auto parser = + fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), + fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), + fst::detail::make_translation_functor( + normalize_whitespace::TransduceToNormalizedWS{}), + stream); rmm::device_uvector outbuf(indata.size(), stream, mr); rmm::device_scalar outbuf_size(stream, mr); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index e12892a2d50..20c143f66c7 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -28,10 +29,12 @@ #include // Forward declaration of parse_options from parsing_utils.cuh -namespace cudf::io { +namespace cudf { +namespace io { + struct parse_options; -} -namespace cudf::io::json { + +namespace json { /** * @brief Struct that encapsulates all information of a columnar tree representation.
@@ -201,6 +204,7 @@ namespace detail { * @param[in] delimiter Specifies the delimiter to use as separator for JSON lines input * @param[in] stream The cuda stream to dispatch GPU kernels to */ +CUDF_EXPORT void get_stack_context(device_span json_in, SymbolT* d_top_of_stack, stack_behavior_t stack_behavior, @@ -216,6 +220,7 @@ void get_stack_context(device_span json_in, * @param stream The cuda stream to dispatch GPU kernels to * @return Returns the post-processed token stream */ +CUDF_EXPORT std::pair, rmm::device_uvector> process_token_stream( device_span tokens, device_span token_indices, @@ -232,6 +237,7 @@ std::pair, rmm::device_uvector> pr * @return A tree representation of the input JSON string as vectors of node type, parent index, * level, begin index, and end index in the input JSON string */ +CUDF_EXPORT tree_meta_t get_tree_representation(device_span tokens, device_span token_indices, bool is_strict_nested_boundaries, @@ -251,6 +257,7 @@ tree_meta_t get_tree_representation(device_span tokens, * @param mr Optional, resource with which to allocate * @return A tuple of the output column indices and the row offsets within each column for each node */ +CUDF_EXPORT std::tuple, rmm::device_uvector> records_orient_tree_traversal(device_span d_input, tree_meta_t const& d_tree, @@ -315,6 +322,7 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt * @param mr Optional, resource with which to allocate * @return The data parsed from the given JSON input */ +CUDF_EXPORT table_with_metadata device_parse_nested_json(device_span input, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, @@ -348,4 +356,6 @@ struct path_from_tree { } // namespace detail -} // namespace cudf::io::json +} // namespace json +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index a007754ef4f..1e484d74679 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1455,11 +1455,14 @@ void get_stack_context(device_span json_in, constexpr auto max_translation_table_size = to_stack_op::NUM_SYMBOL_GROUPS * to_stack_op::TT_NUM_STATES; - auto json_to_stack_ops_fst = fst::detail::make_fst( + static constexpr auto min_translated_out = 0; + static constexpr auto max_translated_out = 1; + auto json_to_stack_ops_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lut(to_stack_op::get_sgid_lut(delimiter)), fst::detail::make_transition_table(to_stack_op::get_transition_table(stack_behavior)), - fst::detail::make_translation_table( - to_stack_op::get_translation_table(stack_behavior)), + fst::detail:: + make_translation_table( + to_stack_op::get_translation_table(stack_behavior)), stream); // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end @@ -1507,11 +1510,12 @@ std::pair, rmm::device_uvector> pr // Instantiate FST for post-processing the token stream to remove all tokens that belong to an // invalid JSON line token_filter::UnwrapTokenFromSymbolOp sgid_op{}; - auto filter_fst = - fst::detail::make_fst(fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), - fst::detail::make_transition_table(token_filter::transition_table), - fst::detail::make_translation_functor(token_filter::TransduceToken{}), - stream); + using symbol_t = thrust::tuple; + auto filter_fst = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), + 
fst::detail::make_transition_table(token_filter::transition_table), + fst::detail::make_translation_functor(token_filter::TransduceToken{}), + stream); auto const mr = rmm::mr::get_current_device_resource(); rmm::device_scalar d_num_selected_tokens(stream, mr); @@ -1598,7 +1602,8 @@ std::pair, rmm::device_uvector> ge fst::detail::make_symbol_group_lookup_op( fix_stack_of_excess_chars::SymbolPairToSymbolGroupId{delimiter}), fst::detail::make_transition_table(fix_stack_of_excess_chars::transition_table), - fst::detail::make_translation_functor(fix_stack_of_excess_chars::TransduceInputOp{}), + fst::detail::make_translation_functor( + fix_stack_of_excess_chars::TransduceInputOp{}), stream); fix_stack_of_excess_chars.Transduce(zip_in, static_cast(json_in.size()), @@ -1619,7 +1624,7 @@ std::pair, rmm::device_uvector> ge auto json_to_tokens_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lookup_op(tokenizer_pda::PdaSymbolToSymbolGroupId{delimiter}), fst::detail::make_transition_table(tokenizer_pda::get_transition_table(format)), - fst::detail::make_translation_table( + fst::detail::make_translation_table( tokenizer_pda::get_translation_table(recover_from_error)), stream); @@ -1698,10 +1703,8 @@ void make_json_column(json_column& root_column, auto const [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); // Copy the JSON tokens to the host - thrust::host_vector tokens = - cudf::detail::make_host_vector_async(d_tokens_gpu, stream); - thrust::host_vector token_indices_gpu = - cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); + auto tokens = cudf::detail::make_host_vector_async(d_tokens_gpu, stream); + auto token_indices_gpu = cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); // Make sure tokens have been copied to the host stream.synchronize(); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 9cd39038348..590f70864b1 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -78,10 +78,9 @@ device_span ingest_raw_input(device_span buffer, auto constexpr num_delimiter_chars = 1; if (compression == compression_type::NONE) { - std::vector delimiter_map{}; + auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); std::vector prefsum_source_sizes(sources.size()); std::vector> h_buffers; - delimiter_map.reserve(sources.size()); size_t bytes_read = 0; std::transform_inclusive_scan(sources.begin(), sources.end(), @@ -148,20 +147,12 @@ device_span ingest_raw_input(device_span buffer, return buffer.first(uncomp_data.size()); } -size_type find_first_delimiter_in_chunk(host_span> sources, - json_reader_options const& reader_opts, - char const delimiter, - rmm::cuda_stream_view stream) +size_t estimate_size_per_subchunk(size_t chunk_size) { - auto total_source_size = sources_size(sources, 0, 0) + (sources.size() - 1); - rmm::device_uvector buffer(total_source_size, stream); - auto readbufspan = ingest_raw_input(buffer, - sources, - reader_opts.get_compression(), - reader_opts.get_byte_range_offset(), - reader_opts.get_byte_range_size(), - stream); - return find_first_delimiter(readbufspan, '\n', stream); + auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); }; + // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to + // 10kb) and the byte range size + return geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size); } /** @@ -183,7 +174,6 @@ datasource::owning_buffer> 
get_record_range_raw_input( rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); }; size_t const total_source_size = sources_size(sources, 0, 0); auto constexpr num_delimiter_chars = 1; @@ -198,17 +188,8 @@ datasource::owning_buffer> get_record_range_raw_input( auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size; - // Some magic numbers - constexpr int num_subchunks = 10; // per chunk_size - constexpr size_t min_subchunk_size = 10000; - int const num_subchunks_prealloced = should_load_all_sources ? 0 : 3; - constexpr int estimated_compression_ratio = 4; - - // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to - // 10kb) and the byte range size - - size_t const size_per_subchunk = - geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size); + int const num_subchunks_prealloced = should_load_all_sources ? 0 : max_subchunks_prealloced; + size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); // The allocation for single source compressed input is estimated by assuming a ~4:1 // compression ratio. For uncompressed inputs, we can get a better estimate using the idea @@ -308,67 +289,78 @@ table_with_metadata read_json(host_span> sources, "Multiple inputs are supported only for JSON Lines format"); } - std::for_each(sources.begin(), sources.end(), [](auto const& source) { - CUDF_EXPECTS(source->size() < std::numeric_limits::max(), - "The size of each source file must be less than INT_MAX bytes"); - }); - - constexpr size_t batch_size_ub = std::numeric_limits::max(); - size_t const chunk_offset = reader_opts.get_byte_range_offset(); + /* + * The batched JSON reader enforces that the size of each batch is at most INT_MAX + * bytes (~2.14GB). Batches are defined to be byte range chunks - characterized by + * chunk offset and chunk size - that may span across multiple source files. + * Note that the batched reader does not work for compressed inputs or for regular + * JSON inputs. + */ size_t const total_source_size = sources_size(sources, 0, 0); size_t chunk_offset = reader_opts.get_byte_range_offset(); size_t chunk_size = reader_opts.get_byte_range_size(); - chunk_size = !chunk_size ? sources_size(sources, 0, 0) : chunk_size; - - // Identify the position of starting source file from which to begin batching based on - // byte range offset. If the offset is larger than the sum of all source - // sizes, then start_source is total number of source files i.e. no file is read - size_t const start_source = [&]() { - size_t sum = 0; + chunk_size = !chunk_size ? total_source_size - chunk_offset + : std::min(chunk_size, total_source_size - chunk_offset); + + size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); + size_t const batch_size_ub = + std::numeric_limits::max() - (max_subchunks_prealloced * size_per_subchunk); + + /* + * Identify the position (zero-indexed) of starting source file from which to begin + * batching based on byte range offset. If the offset is larger than the sum of all + * source sizes, then start_source is total number of source files i.e. 
no file is + * read + */ + + // Prefix sum of source file sizes + size_t pref_source_size = 0; + // Starting source file from which to begin batching, evaluated using byte range offset + size_t const start_source = [chunk_offset, &sources, &pref_source_size]() { for (size_t src_idx = 0; src_idx < sources.size(); ++src_idx) { - if (sum + sources[src_idx]->size() > chunk_offset) return src_idx; - sum += sources[src_idx]->size(); + if (pref_source_size + sources[src_idx]->size() > chunk_offset) { return src_idx; } + pref_source_size += sources[src_idx]->size(); } return sources.size(); }(); - - // Construct batches of source files, with starting position of batches indicated by - // batch_positions. The size of each batch i.e. the sum of sizes of the source files in the batch - // is capped at INT_MAX bytes. - size_t cur_size = 0; - std::vector batch_positions; - std::vector batch_sizes; - batch_positions.push_back(0); - for (size_t i = start_source; i < sources.size(); i++) { - cur_size += sources[i]->size(); - if (cur_size >= batch_size_ub) { - batch_positions.push_back(i); - batch_sizes.push_back(cur_size - sources[i]->size()); - cur_size = sources[i]->size(); + /* + * Construct batches of byte ranges spanning source files, with the starting position of batches + * indicated by `batch_offsets`. `pref_bytes_size` gives the bytes position from which the current + * batch begins, and `end_bytes_size` gives the terminal bytes position after which reading + * stops. + */ + size_t pref_bytes_size = chunk_offset; + size_t end_bytes_size = chunk_offset + chunk_size; + std::vector batch_offsets{pref_bytes_size}; + for (size_t i = start_source; i < sources.size() && pref_bytes_size < end_bytes_size;) { + pref_source_size += sources[i]->size(); + // If the current source file can subsume multiple batches, we split the file until the + // boundary of the last batch exceeds the end of the file (indexed by `pref_source_size`) + while (pref_bytes_size < end_bytes_size && + pref_source_size >= std::min(pref_bytes_size + batch_size_ub, end_bytes_size)) { + auto next_batch_size = std::min(batch_size_ub, end_bytes_size - pref_bytes_size); + batch_offsets.push_back(batch_offsets.back() + next_batch_size); + pref_bytes_size += next_batch_size; } + i++; } - batch_positions.push_back(sources.size()); - batch_sizes.push_back(cur_size); - - // If there is a single batch, then we can directly return the table without the - // unnecessary concatenate - if (batch_sizes.size() == 1) return read_batch(sources, reader_opts, stream, mr); + /* + * If there is a single batch, then we can directly return the table without the + * unnecessary concatenation. The size of batch_offsets is 1 if all sources are empty, + * or if end_bytes_size is larger than total_source_size. + */ + if (batch_offsets.size() <= 2) return read_batch(sources, reader_opts, stream, mr); std::vector partial_tables; json_reader_options batched_reader_opts{reader_opts}; - // Dispatch individual batches to read_batch and push the resulting table into // partial_tables array. Note that the reader options need to be updated for each // batch to adjust byte range offset and byte range size. 
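// Illustrative sketch (not part of this patch): the batch construction above, reduced to its
// core: split the byte range [offset, offset + size) into consecutive batches of at most
// `batch_ub` bytes, recorded as boundary offsets (N batches yield N + 1 offsets). Names are
// hypothetical:
#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<std::size_t> make_batch_offsets(std::size_t offset, std::size_t size,
                                            std::size_t batch_ub)
{
  std::vector<std::size_t> offsets{offset};
  std::size_t const end = offset + size;
  while (offsets.back() < end) {
    offsets.push_back(offsets.back() + std::min(batch_ub, end - offsets.back()));
  }
  return offsets;  // batch i spans [offsets[i], offsets[i + 1])
}
// e.g. splitting 5 GB with a ~2.14 GB (INT_MAX) cap yields three batches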
- for (size_t i = 0; i < batch_sizes.size(); i++) { - batched_reader_opts.set_byte_range_size(std::min(batch_sizes[i], chunk_size)); - partial_tables.emplace_back(read_batch( - host_span>(sources.begin() + batch_positions[i], - batch_positions[i + 1] - batch_positions[i]), - batched_reader_opts, - stream, - rmm::mr::get_current_device_resource())); - if (chunk_size <= batch_sizes[i]) break; - chunk_size -= batch_sizes[i]; - batched_reader_opts.set_byte_range_offset(0); + for (size_t i = 0; i < batch_offsets.size() - 1; i++) { + batched_reader_opts.set_byte_range_offset(batch_offsets[i]); + batched_reader_opts.set_byte_range_size(batch_offsets[i + 1] - batch_offsets[i]); + partial_tables.emplace_back( + read_batch(sources, batched_reader_opts, stream, rmm::mr::get_current_device_resource())); } auto expects_schema_equality = diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index 0c30b4cad46..32de4ebabfa 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -27,7 +28,21 @@ #include -namespace cudf::io::json::detail { +namespace CUDF_EXPORT cudf { +namespace io::json::detail { + +// Some magic numbers +constexpr int num_subchunks = 10; // per chunk_size +constexpr size_t min_subchunk_size = 10000; +constexpr int estimated_compression_ratio = 4; +constexpr int max_subchunks_prealloced = 3; + +device_span ingest_raw_input(device_span buffer, + host_span> sources, + compression_type compression, + size_t range_offset, + size_t range_size, + rmm::cuda_stream_view stream); table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, @@ -38,9 +53,5 @@ size_type find_first_delimiter(device_span d_data, char const delimiter, rmm::cuda_stream_view stream); -size_type find_first_delimiter_in_chunk(host_span> sources, - json_reader_options const& reader_opts, - char const delimiter, - rmm::cuda_stream_view stream); - -} // namespace cudf::io::json::detail +} // namespace io::json::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 8e20505d3ff..e3b9a048be8 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -492,11 +492,17 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& if (num_stripes == 0) return; auto const num_columns = chunks.size().second; - std::vector> prefix_sums_to_update; + auto const num_struct_cols = + std::count_if(chunks[0].begin(), chunks[0].end(), [](auto const& chunk) { + return chunk.type_kind == STRUCT; + }); + auto prefix_sums_to_update = + cudf::detail::make_empty_host_vector>(num_struct_cols, + stream); for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { // Null counts sums are only needed for children of struct columns if (chunks[0][col_idx].type_kind == STRUCT) { - prefix_sums_to_update.emplace_back(col_idx, d_prefix_sums + num_stripes * col_idx); + prefix_sums_to_update.push_back({col_idx, d_prefix_sums + num_stripes * col_idx}); } } auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 805959327ac..80f32512b98 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1417,8 +1417,8 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, if (rg_bounds.count() == 0) return; // Convert map to a vector of views of the `elem_sizes` device buffers - 
std::vector h_sizes; - h_sizes.reserve(elem_sizes.size()); + auto h_sizes = + cudf::detail::make_empty_host_vector(elem_sizes.size(), stream); std::transform(elem_sizes.begin(), elem_sizes.end(), std::back_inserter(h_sizes), [](auto& p) { return decimal_column_element_sizes{p.first, p.second}; }); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 4cb20bb7518..f3b8cfbc836 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -444,14 +444,17 @@ namespace { */ file_segmentation calculate_segmentation(host_span columns, hostdevice_2dvector&& rowgroup_bounds, - stripe_size_limits max_stripe_size) + stripe_size_limits max_stripe_size, + rmm::cuda_stream_view stream) { - std::vector infos; - auto const num_rowgroups = rowgroup_bounds.size().first; - size_t stripe_start = 0; - size_t stripe_bytes = 0; - size_type stripe_rows = 0; - for (size_t rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) { + // Number of stripes is not known in advance. Only reserve a single element to use pinned memory + // resource if at all enabled. + auto infos = cudf::detail::make_empty_host_vector(1, stream); + size_type const num_rowgroups = rowgroup_bounds.size().first; + size_type stripe_start = 0; + size_t stripe_bytes = 0; + size_type stripe_rows = 0; + for (size_type rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) { auto const rowgroup_total_bytes = std::accumulate(columns.begin(), columns.end(), 0ul, [&](size_t total_size, auto const& col) { auto const rows = rowgroup_bounds[rg_idx][col.index()].size(); @@ -470,7 +473,9 @@ file_segmentation calculate_segmentation(host_span column // Check if adding the current rowgroup to the stripe will make the stripe too large or long if ((rg_idx > stripe_start) && (stripe_bytes + rowgroup_total_bytes > max_stripe_size.bytes || stripe_rows + rowgroup_rows_max > max_stripe_size.rows)) { - infos.emplace_back(infos.size(), stripe_start, rg_idx - stripe_start); + infos.push_back(stripe_rowgroups{static_cast(infos.size()), + stripe_start, + static_cast(rg_idx - stripe_start)}); stripe_start = rg_idx; stripe_bytes = 0; stripe_rows = 0; @@ -479,7 +484,9 @@ file_segmentation calculate_segmentation(host_span column stripe_bytes += rowgroup_total_bytes; stripe_rows += rowgroup_rows_max; if (rg_idx + 1 == num_rowgroups) { - infos.emplace_back(infos.size(), stripe_start, num_rowgroups - stripe_start); + infos.push_back(stripe_rowgroups{static_cast(infos.size()), + stripe_start, + static_cast(num_rowgroups - stripe_start)}); } } @@ -1336,7 +1343,7 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, if (num_file_blobs == 0) { return {}; } // Create empty file stats and merge groups - std::vector h_stat_chunks(num_file_blobs); + auto h_stat_chunks = cudf::detail::make_host_vector(num_file_blobs, stream); cudf::detail::hostdevice_vector stats_merge(num_file_blobs, stream); // Fill in stats_merge and stat_chunks on the host for (auto i = 0u; i < num_file_blobs; ++i) { @@ -1677,39 +1684,39 @@ struct pushdown_null_masks { // Owning vector for masks in device memory std::vector> data; // Pointers to pushdown masks in device memory. Can be same for multiple columns. 
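// Illustrative sketch (not part of this patch): the pushdown-mask selection logic below reuses an
// existing mask when only one of {column null mask, parent pushdown mask} exists, and
// materializes their bitwise AND when both do (in the patch this runs on device via
// thrust::transform). A host-side sketch of the combining step, assuming equal-length word
// arrays; names are hypothetical:
#include <cstdint>
#include <vector>

std::vector<std::uint32_t> and_masks(std::vector<std::uint32_t> const& null_mask,
                                     std::vector<std::uint32_t> const& parent_mask)
{
  std::vector<std::uint32_t> combined(null_mask.size());
  for (std::size_t w = 0; w < combined.size(); ++w) {
    combined[w] = null_mask[w] & parent_mask[w];  // a row is valid only if both agree
  }
  return combined;
}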
- std::vector masks; + cudf::detail::host_vector masks; }; pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, rmm::cuda_stream_view stream) { - std::vector mask_ptrs; - mask_ptrs.reserve(orc_table.num_columns()); + auto mask_ptrs = + cudf::detail::make_empty_host_vector(orc_table.num_columns(), stream); std::vector> pd_masks; for (auto const& col : orc_table.columns) { // Leaf columns don't need pushdown masks if (col.num_children() == 0) { - mask_ptrs.emplace_back(nullptr); + mask_ptrs.push_back({nullptr}); continue; } auto const parent_pd_mask = col.is_child() ? mask_ptrs[col.parent_index()] : nullptr; auto const null_mask = col.null_mask(); if (null_mask == nullptr and parent_pd_mask == nullptr) { - mask_ptrs.emplace_back(nullptr); + mask_ptrs.push_back({nullptr}); continue; } if (col.orc_kind() == STRUCT) { if (null_mask != nullptr and parent_pd_mask == nullptr) { // Reuse own null mask - mask_ptrs.emplace_back(null_mask); + mask_ptrs.push_back(null_mask); } else if (null_mask == nullptr and parent_pd_mask != nullptr) { // Reuse parent's pushdown mask - mask_ptrs.emplace_back(parent_pd_mask); + mask_ptrs.push_back(parent_pd_mask); } else { // Both are nullable, allocate new pushdown mask pd_masks.emplace_back(num_bitmask_words(col.size()), stream); - mask_ptrs.emplace_back(pd_masks.back().data()); + mask_ptrs.push_back({pd_masks.back().data()}); thrust::transform(rmm::exec_policy(stream), null_mask, @@ -1724,7 +1731,7 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, auto const child_col = orc_table.column(col.child_begin()[0]); // pushdown mask applies to child column(s); use the child column size pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream); - mask_ptrs.emplace_back(pd_masks.back().data()); + mask_ptrs.push_back({pd_masks.back().data()}); pushdown_lists_null_mask(col, orc_table.d_columns, parent_pd_mask, pd_masks.back(), stream); } } @@ -1815,8 +1822,7 @@ orc_table_view make_orc_table_view(table_view const& table, append_orc_column(table.column(col_idx), nullptr, table_meta.column_metadata[col_idx]); } - std::vector type_kinds; - type_kinds.reserve(orc_columns.size()); + auto type_kinds = cudf::detail::make_empty_host_vector(orc_columns.size(), stream); std::transform( orc_columns.cbegin(), orc_columns.cend(), std::back_inserter(type_kinds), [](auto& orc_column) { return orc_column.orc_kind(); @@ -2299,7 +2305,7 @@ auto convert_table_to_orc_data(table_view const& input, // Decide stripe boundaries based on rowgroups and char counts auto segmentation = - calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size); + calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size, stream); auto stripe_dicts = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream); auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index bd082befe0c..f5f8b3cfed9 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -78,10 +78,9 @@ struct orc_table_view { * Provides a container-like interface to iterate over rowgroup indices. 
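// Illustrative sketch (not part of this patch): the container-like interface mentioned above is a
// pair of counting iterators over [first, first + size), so iterating a stripe's rowgroup indices
// allocates nothing. A standalone version of the same idiom with a hypothetical struct name:
#include <thrust/iterator/counting_iterator.h>

struct index_range {
  int first, size;
  [[nodiscard]] auto cbegin() const { return thrust::make_counting_iterator(first); }
  [[nodiscard]] auto cend() const { return thrust::make_counting_iterator(first + size); }
};
// for (auto it = r.cbegin(); it != r.cend(); ++it) visits first .. first + size - 1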
*/ struct stripe_rowgroups { - uint32_t id; // stripe id - uint32_t first; // first rowgroup in the stripe - uint32_t size; // number of rowgroups in the stripe - stripe_rowgroups(uint32_t id, uint32_t first, uint32_t size) : id{id}, first{first}, size{size} {} + size_type id; // stripe id + size_type first; // first rowgroup in the stripe + size_type size; // number of rowgroups in the stripe [[nodiscard]] auto cbegin() const { return thrust::make_counting_iterator(first); } [[nodiscard]] auto cend() const { return thrust::make_counting_iterator(first + size); } }; @@ -125,7 +124,7 @@ class orc_streams { */ struct file_segmentation { hostdevice_2dvector rowgroups; - std::vector stripes; + cudf::detail::host_vector stripes; auto num_rowgroups() const noexcept { return rowgroups.size().first; } auto num_stripes() const noexcept { return stripes.size(); } diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index bcc9adfc8c0..12c24e2b848 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -18,6 +18,8 @@ #include "parquet.hpp" +#include + #include #include #include @@ -25,7 +27,8 @@ #include #include -namespace cudf::io::parquet::detail { +namespace CUDF_EXPORT cudf { +namespace io::parquet::detail { /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata @@ -149,4 +152,5 @@ class CompactProtocolReader { friend class parquet_field_struct_blob; }; -} // namespace cudf::io::parquet::detail +} // namespace io::parquet::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 11f4a00ee8b..481c1e9fcdd 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -141,11 +141,11 @@ struct stats_caster { // Local struct to hold host columns struct host_column { // using thrust::host_vector because std::vector uses bitmap instead of byte per bool. - thrust::host_vector val; + cudf::detail::host_vector val; std::vector null_mask; cudf::size_type null_count = 0; - host_column(size_type total_row_groups) - : val(total_row_groups), + host_column(size_type total_row_groups, rmm::cuda_stream_view stream) + : val{cudf::detail::make_host_vector(total_row_groups, stream)}, null_mask( cudf::util::div_rounding_up_safe( cudf::bitmask_allocation_size_bytes(total_row_groups), sizeof(bitmask_type)), @@ -170,8 +170,14 @@ struct stats_caster { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - std::vector chars{}; - std::vector offsets(1, 0); + auto const total_char_count = std::accumulate( + host_strings.begin(), host_strings.end(), 0, [](auto sum, auto const& str) { + return sum + str.size_bytes(); + }); + auto chars = cudf::detail::make_empty_host_vector(total_char_count, stream); + auto offsets = + cudf::detail::make_empty_host_vector(host_strings.size() + 1, stream); + offsets.push_back(0); for (auto const& str : host_strings) { auto tmp = str.empty() ? 
std::string_view{} : std::string_view(str.data(), str.size_bytes()); @@ -206,8 +212,8 @@ struct stats_caster { null_count); } }; // local struct host_column - host_column min(total_row_groups); - host_column max(total_row_groups); + host_column min(total_row_groups, stream); + host_column max(total_row_groups, stream); size_type stats_idx = 0; for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { for (auto const rg_idx : row_group_indices[src_idx]) { diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index 8dfd68cd9b8..65dafb568c0 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -41,6 +41,11 @@ chunked_reader::chunked_reader(std::size_t chunk_read_limit, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + // TODO: skip_rows not currently supported in chunked parquet reader until + // https://github.com/rapidsai/cudf/issues/16186 is closed + CUDF_EXPECTS(options.get_skip_rows() == 0, + "skip_rows > 0 is not currently supported in the Chunked Parquet reader."); + _impl = std::make_unique( chunk_read_limit, pass_read_limit, std::move(sources), options, stream, mr); } diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index f705f6626e7..68ec61ead0a 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -26,6 +26,7 @@ #include +#include #include #include @@ -549,7 +550,17 @@ table_with_metadata reader::impl::read_chunk_internal(read_mode mode) out_columns.reserve(_output_buffers.size()); // no work to do (this can happen on the first pass if we have no rows to read) - if (!has_more_work()) { return finalize_output(out_metadata, out_columns); } + if (!has_more_work()) { + // Check if number of rows per source should be included in output metadata. + if (include_output_num_rows_per_source()) { + // Empty dataframe case: Simply initialize to a list of zeros + out_metadata.num_rows_per_source = + std::vector(_file_itm_data.num_rows_per_source.size(), 0); + } + + // Finalize output + return finalize_output(mode, out_metadata, out_columns); + } auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; @@ -585,11 +596,80 @@ table_with_metadata reader::impl::read_chunk_internal(read_mode mode) } } + // Check if number of rows per source should be included in output metadata. + if (include_output_num_rows_per_source()) { + // For chunked reading, compute the output number of rows per source + if (mode == read_mode::CHUNKED_READ) { + out_metadata.num_rows_per_source = + calculate_output_num_rows_per_source(read_info.skip_rows, read_info.num_rows); + } + // Simply move the number of rows per file if reading all at once + else { + // Move is okay here as we are reading in one go. + out_metadata.num_rows_per_source = std::move(_file_itm_data.num_rows_per_source); + } + } + // Add empty columns if needed. Filter output columns based on filter. - return finalize_output(out_metadata, out_columns); + return finalize_output(mode, out_metadata, out_columns); +} + +std::vector reader::impl::calculate_output_num_rows_per_source(size_t const chunk_start_row, + size_t const chunk_num_rows) +{ + // Handle base cases. 
+  if (_file_itm_data.num_rows_per_source.size() == 0) {
+    return {};
+  } else if (_file_itm_data.num_rows_per_source.size() == 1) {
+    return {chunk_num_rows};
+  }
+
+  std::vector<size_t> num_rows_per_source(_file_itm_data.num_rows_per_source.size(), 0);
+
+  // Subtract global skip rows from the start_row as we took care of that when computing
+  // _file_itm_data.num_rows_per_source
+  auto const start_row = chunk_start_row - _file_itm_data.global_skip_rows;
+  auto const end_row   = start_row + chunk_num_rows;
+  CUDF_EXPECTS(start_row <= end_row and end_row <= _file_itm_data.global_num_rows,
+               "Encountered invalid output chunk row bounds.");
+
+  // Copy reference to a const local variable for better readability
+  auto const& partial_sum_nrows_source = _file_itm_data.exclusive_sum_num_rows_per_source;
+
+  // Binary search start_row and end_row in exclusive_sum_num_rows_per_source vector
+  auto const start_iter =
+    std::upper_bound(partial_sum_nrows_source.cbegin(), partial_sum_nrows_source.cend(), start_row);
+  auto const end_iter =
+    (end_row == _file_itm_data.global_skip_rows + _file_itm_data.global_num_rows)
+      ? partial_sum_nrows_source.cend() - 1
+      : std::upper_bound(start_iter, partial_sum_nrows_source.cend(), end_row);
+
+  // Compute the array offset index for both iterators
+  auto const start_idx = std::distance(partial_sum_nrows_source.cbegin(), start_iter);
+  auto const end_idx   = std::distance(partial_sum_nrows_source.cbegin(), end_iter);
+
+  CUDF_EXPECTS(start_idx <= end_idx,
+               "Encountered invalid source file indices for output chunk row bounds");
+
+  // If the entire chunk is from the same source file, then the count is simply num_rows
+  if (start_idx == end_idx) {
+    num_rows_per_source[start_idx] = chunk_num_rows;
+  } else {
+    // Compute the number of rows from the first source file
+    num_rows_per_source[start_idx] = partial_sum_nrows_source[start_idx] - start_row;
+    // Compute the number of rows from the last source file
+    num_rows_per_source[end_idx] = end_row - partial_sum_nrows_source[end_idx - 1];
+    // Simply copy the number of rows for each source in range: (start_idx, end_idx)
+    std::copy(_file_itm_data.num_rows_per_source.cbegin() + start_idx + 1,
+              _file_itm_data.num_rows_per_source.cbegin() + end_idx,
+              num_rows_per_source.begin() + start_idx + 1);
+  }
+
+  return num_rows_per_source;
 }
 
-table_with_metadata reader::impl::finalize_output(table_metadata& out_metadata,
+table_with_metadata reader::impl::finalize_output(read_mode mode,
+                                                  table_metadata& out_metadata,
                                                   std::vector<std::unique_ptr<column>>& out_columns)
 {
   // Create empty columns as needed (this can happen if we've ended up with no actual data to read)
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index 3b8e80a29e6..5e3cc4301f9 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -262,11 +262,13 @@ class reader::impl {
    * @brief Finalize the output table by adding empty columns for the non-selected columns in
    * schema.
    *
+   * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
+   * @param out_metadata The output table metadata
    * @param out_columns The columns for building the output table
    * @return The output table along with columns' metadata
    */
-  table_with_metadata finalize_output(table_metadata& out_metadata,
+  table_with_metadata finalize_output(read_mode mode,
+                                      table_metadata& out_metadata,
                                       std::vector<std::unique_ptr<column>>& out_columns);
 
   /**
@@ -336,11 +338,36 @@ class reader::impl {
       : true;
   }
 
+  /**
+   * @brief Check if this is the first output chunk
+   *
+   * @return True if this is the first output chunk
+   */
   [[nodiscard]] bool is_first_output_chunk() const
   {
     return _file_itm_data._output_chunk_count == 0;
   }
 
+  /**
+   * @brief Check if number of rows per source should be included in output metadata.
+   *
+   * @return True if AST filter is not present
+   */
+  [[nodiscard]] bool include_output_num_rows_per_source() const
+  {
+    return not _expr_conv.get_converted_expr().has_value();
+  }
+
+  /**
+   * @brief Calculate the number of rows read from each source in the output chunk
+   *
+   * @param chunk_start_row The offset of the first row in the output chunk
+   * @param chunk_num_rows The number of rows in the output chunk
+   * @return Vector of number of rows from each respective data source in the output chunk
+   */
+  [[nodiscard]] std::vector<size_t> calculate_output_num_rows_per_source(size_t chunk_start_row,
+                                                                         size_t chunk_num_rows);
+
   rmm::cuda_stream_view _stream;
   rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
 
@@ -387,7 +414,7 @@ class reader::impl {
   // chunked reading happens in 2 parts:
   //
-  // At the top level, the entire file is divided up into "passes" omn which we try and limit the
+  // At the top level, the entire file is divided up into "passes" on which we try and limit the
   // total amount of temporary memory (compressed data, decompressed data) in use
   // via _input_pass_read_limit.
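
// For illustration: a minimal, standalone sketch of the row-range-to-source
// mapping that calculate_output_num_rows_per_source() above performs. The
// names split_rows_by_source and rows_per_source are hypothetical, the sketch
// assumes num_rows > 0 and in-bounds inputs, and cudf's version additionally
// works on precomputed prefix sums and accounts for skip_rows.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

std::vector<std::size_t> split_rows_by_source(std::vector<std::size_t> const& rows_per_source,
                                              std::size_t start_row,
                                              std::size_t num_rows)
{
  // Inclusive prefix sums of the per-source row counts, e.g. {10, 20, 5} -> {10, 30, 35}
  std::vector<std::size_t> sums(rows_per_source.size());
  std::inclusive_scan(rows_per_source.begin(), rows_per_source.end(), sums.begin());

  auto const end_row = start_row + num_rows;
  std::vector<std::size_t> out(rows_per_source.size(), 0);

  // Indices of the first and last sources overlapped by [start_row, end_row)
  auto const first = std::upper_bound(sums.begin(), sums.end(), start_row) - sums.begin();
  auto const last  = std::upper_bound(sums.begin(), sums.end(), end_row - 1) - sums.begin();

  if (first == last) {  // the whole chunk comes from a single source
    out[first] = num_rows;
  } else {
    out[first] = sums[first] - start_row;   // tail of the first overlapped source
    out[last]  = end_row - sums[last - 1];  // head of the last overlapped source
    for (auto i = first + 1; i < last; ++i) {
      out[i] = rows_per_source[i];  // sources fully covered by the chunk
    }
  }
  return out;
}

// Example: split_rows_by_source({10, 20, 5}, 8, 12) returns {2, 10, 0}.
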
 //
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 3da303e6928..794750ab6d2 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -804,16 +804,16 @@ std::vector compute_page_splits_by_row(device_span
-  std::vector<device_span<uint8_t const>> comp_in;
-  comp_in.reserve(num_comp_pages);
-  std::vector<device_span<uint8_t>> comp_out;
-  comp_out.reserve(num_comp_pages);
+  auto comp_in =
+    cudf::detail::make_empty_host_vector<device_span<uint8_t const>>(num_comp_pages, stream);
+  auto comp_out =
+    cudf::detail::make_empty_host_vector<device_span<uint8_t>>(num_comp_pages, stream);
 
   // vectors to save v2 def and rep level data, if any
-  std::vector<device_span<uint8_t const>> copy_in;
-  copy_in.reserve(num_comp_pages);
-  std::vector<device_span<uint8_t>> copy_out;
-  copy_out.reserve(num_comp_pages);
+  auto copy_in =
+    cudf::detail::make_empty_host_vector<device_span<uint8_t const>>(num_comp_pages, stream);
+  auto copy_out =
+    cudf::detail::make_empty_host_vector<device_span<uint8_t>>(num_comp_pages, stream);
 
   rmm::device_uvector<compression_result> comp_res(num_comp_pages, stream);
   thrust::fill(rmm::exec_policy_nosync(stream),
@@ -822,7 +822,6 @@ std::vector compute_page_splits_by_row(device_span compute_page_splits_by_row(device_span(offset)});
+          copy_out.push_back({dst_base, static_cast<size_t>(offset)});
         }
-        comp_in.emplace_back(page.page_data + offset,
-                             static_cast<size_t>(page.compressed_page_size - offset));
-        comp_out.emplace_back(dst_base + offset,
-                              static_cast<size_t>(page.uncompressed_page_size - offset));
+        comp_in.push_back(
+          {page.page_data + offset, static_cast<size_t>(page.compressed_page_size - offset)});
+        comp_out.push_back(
+          {dst_base + offset, static_cast<size_t>(page.uncompressed_page_size - offset)});
         page.page_data = dst_base;
         decomp_offset += page.uncompressed_page_size;
       });
+  }
 
+  auto d_comp_in = cudf::detail::make_device_uvector_async(
+    comp_in, stream, rmm::mr::get_current_device_resource());
+  auto d_comp_out = cudf::detail::make_device_uvector_async(
+    comp_out, stream, rmm::mr::get_current_device_resource());
+
+  int32_t start_pos = 0;
+  for (auto const& codec : codecs) {
+    if (codec.num_pages == 0) { continue; }
+
+    device_span<device_span<uint8_t const> const> d_comp_in_view{d_comp_in.data() + start_pos,
+                                                                 codec.num_pages};
+
+    device_span<device_span<uint8_t> const> d_comp_out_view(d_comp_out.data() + start_pos,
+                                                            codec.num_pages);
-    host_span<device_span<uint8_t const> const> comp_in_view{comp_in.data() + start_pos,
-                                                             codec.num_pages};
-    auto const d_comp_in = cudf::detail::make_device_uvector_async(
-      comp_in_view, stream, rmm::mr::get_current_device_resource());
-    host_span<device_span<uint8_t> const> comp_out_view(comp_out.data() + start_pos,
-                                                        codec.num_pages);
-    auto const d_comp_out = cudf::detail::make_device_uvector_async(
-      comp_out_view, stream, rmm::mr::get_current_device_resource());
     device_span<compression_result> d_comp_res_view(comp_res.data() + start_pos, codec.num_pages);
 
     switch (codec.compression_type) {
       case GZIP:
-        gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream);
+        gpuinflate(
+          d_comp_in_view, d_comp_out_view, d_comp_res_view, gzip_header_included::YES, stream);
         break;
       case SNAPPY:
         if (cudf::io::nvcomp_integration::is_stable_enabled()) {
           nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY,
-                                     d_comp_in,
-                                     d_comp_out,
+                                     d_comp_in_view,
+                                     d_comp_out_view,
                                      d_comp_res_view,
                                      codec.max_decompressed_size,
                                      codec.total_decomp_size,
                                      stream);
         } else {
-          gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream);
+          gpu_unsnap(d_comp_in_view, d_comp_out_view, d_comp_res_view, stream);
         }
         break;
       case ZSTD:
         nvcomp::batched_decompress(nvcomp::compression_type::ZSTD,
-                                   d_comp_in,
-                                   d_comp_out,
+                                   d_comp_in_view,
+                                   d_comp_out_view,
                                    d_comp_res_view,
                                    codec.max_decompressed_size,
codec.total_decomp_size, stream); break; case BROTLI: - gpu_debrotli(d_comp_in, - d_comp_out, + gpu_debrotli(d_comp_in_view, + d_comp_out_view, d_comp_res_view, debrotli_scratch.data(), debrotli_scratch.size(), @@ -893,8 +900,8 @@ std::vector compute_page_splits_by_row(device_span chunk decomp_sum{}); // retrieve to host so we can call nvcomp to get compression scratch sizes - std::vector h_decomp_info = - cudf::detail::make_std_vector_sync(decomp_info, stream); - std::vector temp_cost(pages.size()); + auto h_decomp_info = cudf::detail::make_host_vector_sync(decomp_info, stream); + auto temp_cost = cudf::detail::make_host_vector(pages.size(), stream); thrust::transform(thrust::host, h_decomp_info.begin(), h_decomp_info.end(), @@ -1232,22 +1238,22 @@ void reader::impl::setup_next_pass(read_mode mode) pass.skip_rows = _file_itm_data.global_skip_rows; pass.num_rows = _file_itm_data.global_num_rows; } else { - auto const global_start_row = _file_itm_data.global_skip_rows; - auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = - std::max(_file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass], - global_start_row); - auto const end_row = - std::min(_file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass + 1], - global_end_row); - - // skip_rows is always global in the sense that it is relative to the first row of - // everything we will be reading, regardless of what pass we are on. - // num_rows is how many rows we are reading this pass. - pass.skip_rows = - global_start_row + + // pass_start_row and pass_end_row are computed from the selected row groups relative to the + // global_skip_rows. + auto const pass_start_row = _file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass]; - pass.num_rows = end_row - start_row; + auto const pass_end_row = + std::min(_file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass + 1], + _file_itm_data.global_num_rows); + + // pass.skip_rows is always global in the sense that it is relative to the first row of + // the data source (global row number 0), regardless of what pass we are on. Therefore, + // we must re-add global_skip_rows to the pass_start_row which is relative to the + // global_skip_rows. + pass.skip_rows = _file_itm_data.global_skip_rows + pass_start_row; + // num_rows is how many rows we are reading this pass. Since this is a difference, adding + // global_skip_rows to both variables is redundant. + pass.num_rows = pass_end_row - pass_start_row; } // load page information for the chunk. this retrieves the compressed bytes for all the @@ -1509,6 +1515,7 @@ void reader::impl::create_global_chunk_info() // Initialize column chunk information auto remaining_rows = num_rows; + auto skip_rows = _file_itm_data.global_skip_rows; for (auto const& rg : row_groups_info) { auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); auto const row_group_start = rg.start_row; @@ -1561,7 +1568,12 @@ void reader::impl::create_global_chunk_info() schema.type == BYTE_ARRAY and _strings_to_categorical)); } - remaining_rows -= row_group_rows; + // Adjust for skip_rows when updating the remaining rows after the first group + remaining_rows -= + (skip_rows) ? 
std::min(rg.start_row + row_group.num_rows - skip_rows, remaining_rows) + : row_group_rows; + // Set skip_rows = 0 as it is no longer needed for subsequent row_groups + skip_rows = 0; } } @@ -1598,6 +1610,9 @@ void reader::impl::compute_input_passes() _file_itm_data.input_pass_row_group_offsets.push_back(0); _file_itm_data.input_pass_start_row_count.push_back(0); + // To handle global_skip_rows when computing input passes + int skip_rows = _file_itm_data.global_skip_rows; + for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { auto const& rgi = row_groups_info[cur_rg_index]; auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); @@ -1606,6 +1621,14 @@ void reader::impl::compute_input_passes() auto const [compressed_rg_size, _ /*compressed + uncompressed*/] = get_row_group_size(row_group); + // We must use the effective size of the first row group we are reading to accurately calculate + // the first non-zero input_pass_start_row_count. + auto const row_group_rows = + (skip_rows) ? rgi.start_row + row_group.num_rows - skip_rows : row_group.num_rows; + + // Set skip_rows = 0 as it is no longer needed for subsequent row_groups + skip_rows = 0; + // can we add this row group if (cur_pass_byte_size + compressed_rg_size >= comp_read_limit) { // A single row group (the current one) is larger than the read limit: @@ -1613,7 +1636,7 @@ void reader::impl::compute_input_passes() // row group if (cur_rg_start == cur_rg_index) { _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _file_itm_data.input_pass_start_row_count.push_back(cur_row_count + row_group.num_rows); + _file_itm_data.input_pass_start_row_count.push_back(cur_row_count + row_group_rows); cur_rg_start = cur_rg_index + 1; cur_pass_byte_size = 0; } @@ -1627,7 +1650,7 @@ void reader::impl::compute_input_passes() } else { cur_pass_byte_size += compressed_rg_size; } - cur_row_count += row_group.num_rows; + cur_row_count += row_group_rows; } // add the last pass if necessary diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index b959c793011..3a3cdd34a58 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -41,6 +41,12 @@ struct file_intermediate_data { // is not capped by global_skip_rows and global_num_rows. 
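
// A worked example of the skip_rows adjustment used in create_global_chunk_info()
// and compute_input_passes() above, with illustrative numbers that are not from
// the patch: two row groups of 100 rows each and global_skip_rows = 30, where
// the skip falls inside the first selected row group.

#include <cstddef>

std::size_t effective_row_group_rows(std::size_t group_start_row,  // global start row of the group
                                     std::size_t group_num_rows,
                                     std::size_t& skip_rows)       // consumed by the first group
{
  // Mirrors the pattern above: (skip_rows) ? start_row + num_rows - skip_rows : num_rows
  auto const rows = skip_rows ? group_start_row + group_num_rows - skip_rows : group_num_rows;
  skip_rows       = 0;  // subsequent groups contribute all of their rows
  return rows;
}

// With the numbers above: group 0 contributes 0 + 100 - 30 = 70 rows, group 1 the full 100.
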
std::vector input_pass_start_row_count{}; + // number of rows to be read from each data source + std::vector num_rows_per_source{}; + + // partial sum of the number of rows per data source + std::vector exclusive_sum_num_rows_per_source{}; + size_t _current_input_pass{0}; // current input pass index size_t _output_chunk_count{0}; // how many output chunks we have produced diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index d1e9a823d3b..581c44d024b 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -945,7 +945,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple> +std::tuple, std::vector> aggregate_reader_metadata::select_row_groups( host_span const> row_group_indices, int64_t skip_rows_opt, @@ -976,6 +976,9 @@ aggregate_reader_metadata::select_row_groups( static_cast(from_opts.second)}; }(); + // Get number of rows in each data source + std::vector num_rows_per_source(per_file_metadata.size(), 0); + if (!row_group_indices.empty()) { CUDF_EXPECTS(row_group_indices.size() == per_file_metadata.size(), "Must specify row groups for each source"); @@ -989,28 +992,45 @@ aggregate_reader_metadata::select_row_groups( selection.emplace_back(rowgroup_idx, rows_to_read, src_idx); // if page-level indexes are present, then collect extra chunk and page info. column_info_for_row_group(selection.back(), 0); - rows_to_read += get_row_group(rowgroup_idx, src_idx).num_rows; + auto const rows_this_rg = get_row_group(rowgroup_idx, src_idx).num_rows; + rows_to_read += rows_this_rg; + num_rows_per_source[src_idx] += rows_this_rg; } } } else { size_type count = 0; for (size_t src_idx = 0; src_idx < per_file_metadata.size(); ++src_idx) { auto const& fmd = per_file_metadata[src_idx]; - for (size_t rg_idx = 0; rg_idx < fmd.row_groups.size(); ++rg_idx) { + for (size_t rg_idx = 0; + rg_idx < fmd.row_groups.size() and count < rows_to_skip + rows_to_read; + ++rg_idx) { auto const& rg = fmd.row_groups[rg_idx]; auto const chunk_start_row = count; count += rg.num_rows; if (count > rows_to_skip || count == 0) { + // start row of this row group adjusted with rows_to_skip + num_rows_per_source[src_idx] += count; + num_rows_per_source[src_idx] -= + (chunk_start_row <= rows_to_skip) ? rows_to_skip : chunk_start_row; + + // We need the unadjusted start index of this row group to correctly initialize + // ColumnChunkDesc for this row group in create_global_chunk_info() and calculate + // the row offset for the first pass in compute_input_passes(). selection.emplace_back(rg_idx, chunk_start_row, src_idx); - // if page-level indexes are present, then collect extra chunk and page info. + + // If page-level indexes are present, then collect extra chunk and page info. + // The page indexes rely on absolute row numbers, not adjusted for skip_rows. column_info_for_row_group(selection.back(), chunk_start_row); } - if (count >= rows_to_skip + rows_to_read) { break; } + // Adjust the number of rows for the last source file. 
+ if (count >= rows_to_skip + rows_to_read) { + num_rows_per_source[src_idx] -= count - rows_to_skip - rows_to_read; + } } } } - return {rows_to_skip, rows_to_read, std::move(selection)}; + return {rows_to_skip, rows_to_read, std::move(selection), std::move(num_rows_per_source)}; } std::tuple, diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 6bfa8519c76..309132a5347 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -282,17 +282,17 @@ class aggregate_reader_metadata { * @param output_column_schemas schema indices of output columns * @param filter Optional AST expression to filter row groups based on Column chunk statistics * @param stream CUDA stream used for device memory operations and kernel launches - * @return A tuple of corrected row_start, row_count and list of row group indexes and its - * starting row + * @return A tuple of corrected row_start, row_count, list of row group indexes and its + * starting row, and list of number of rows per source. */ - [[nodiscard]] std::tuple> select_row_groups( - host_span const> row_group_indices, - int64_t row_start, - std::optional const& row_count, - host_span output_dtypes, - host_span output_column_schemas, - std::optional> filter, - rmm::cuda_stream_view stream) const; + [[nodiscard]] std::tuple, std::vector> + select_row_groups(host_span const> row_group_indices, + int64_t row_start, + std::optional const& row_count, + host_span output_dtypes, + host_span output_column_schemas, + std::optional> filter, + rmm::cuda_stream_view stream) const; /** * @brief Filters and reduces down to a selection of columns diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index f28a7311ccb..e006cc7d714 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -370,7 +370,7 @@ void fill_in_page_info(host_span chunks, rmm::cuda_stream_view stream) { auto const num_pages = pages.size(); - std::vector page_indexes(num_pages); + auto page_indexes = cudf::detail::make_host_vector(num_pages, stream); for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { auto const& chunk = chunks[c]; @@ -1031,8 +1031,8 @@ struct get_page_num_rows { }; struct input_col_info { - int const schema_idx; - size_type const nesting_depth; + int schema_idx; + size_type nesting_depth; }; /** @@ -1235,8 +1235,10 @@ void reader::impl::preprocess_file(read_mode mode) [](auto const& col) { return col.type; }); } - std::tie( - _file_itm_data.global_skip_rows, _file_itm_data.global_num_rows, _file_itm_data.row_groups) = + std::tie(_file_itm_data.global_skip_rows, + _file_itm_data.global_num_rows, + _file_itm_data.row_groups, + _file_itm_data.num_rows_per_source) = _metadata->select_row_groups(_options.row_group_indices, _options.skip_rows, _options.num_rows, @@ -1245,9 +1247,18 @@ void reader::impl::preprocess_file(read_mode mode) _expr_conv.get_converted_expr(), _stream); + // Inclusive scan the number of rows per source + if (not _expr_conv.get_converted_expr().has_value() and mode == read_mode::CHUNKED_READ) { + _file_itm_data.exclusive_sum_num_rows_per_source.resize( + _file_itm_data.num_rows_per_source.size()); + thrust::inclusive_scan(_file_itm_data.num_rows_per_source.cbegin(), + _file_itm_data.num_rows_per_source.cend(), + _file_itm_data.exclusive_sum_num_rows_per_source.begin()); + } + // check for page indexes - _has_page_index = 
std::all_of(_file_itm_data.row_groups.begin(), - _file_itm_data.row_groups.end(), + _has_page_index = std::all_of(_file_itm_data.row_groups.cbegin(), + _file_itm_data.row_groups.cend(), [](auto const& row_group) { return row_group.has_page_index(); }); if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && @@ -1512,8 +1523,8 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num // compute output column sizes by examining the pages of the -input- columns if (has_lists) { - std::vector h_cols_info; - h_cols_info.reserve(_input_columns.size()); + auto h_cols_info = + cudf::detail::make_empty_host_vector(_input_columns.size(), _stream); std::transform(_input_columns.cbegin(), _input_columns.cend(), std::back_inserter(h_cols_info), diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 8413e716224..36a1d8377bf 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -22,6 +22,7 @@ #include "arrow_schema_writer.hpp" #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" +#include "interop/decimal_conversion_utilities.cuh" #include "io/comp/nvcomp_adapter.hpp" #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_gpu.hpp" @@ -1601,50 +1602,12 @@ size_t column_index_buffer_size(EncColumnChunk* ck, return ck->ck_stat_size * num_pages + column_index_truncate_length + padding + size_struct_size; } -/** - * @brief Convert decimal32 and decimal64 data to decimal128 and return the device vector - * - * @tparam DecimalType to convert from - * - * @param column A view of the input columns - * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return A device vector containing the converted decimal128 data - */ -template -rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& column, - rmm::cuda_stream_view stream) -{ - size_type constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType); - - rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream); - - thrust::for_each(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(column.size()), - [in = column.begin(), - out = reinterpret_cast(d128_buffer.data()), - BIT_WIDTH_RATIO] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // The lowest order bits are the value, the remainder - // simply matches the sign bit to satisfy the two's - // complement integer representation of negative numbers. - out[out_idx] = in[in_idx]; -#pragma unroll BIT_WIDTH_RATIO - 1 - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; - } - }); - - return d128_buffer; -} - /** * @brief Function to convert decimal32 and decimal64 columns to decimal128 data, * update the input table metadata, and return a new vector of column views. * * @param[in,out] table_meta The table metadata - * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers. + * @param[in,out] d128_buffers Buffers containing the converted decimal128 data. 
* @param input The input table * @param stream CUDA stream used for device memory operations and kernel launches * @@ -1652,7 +1615,7 @@ rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& co */ std::vector convert_decimal_columns_and_metadata( table_input_metadata& table_meta, - std::vector>& d128_vectors, + std::vector>& d128_buffers, table_view const& table, rmm::cuda_stream_view stream) { @@ -1673,28 +1636,30 @@ std::vector convert_decimal_columns_and_metadata( switch (column.type().id()) { case type_id::DECIMAL32: // Convert data to decimal128 type - d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); + d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128( + column, stream, rmm::mr::get_current_device_resource())); // Update metadata metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION); metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); // Create a new column view from the d128 data vector return {data_type{type_id::DECIMAL128, column.type().scale()}, column.size(), - d128_vectors.back().data(), + d128_buffers.back()->data(), column.null_mask(), column.null_count(), column.offset(), converted_children}; case type_id::DECIMAL64: // Convert data to decimal128 type - d128_vectors.emplace_back(convert_data_to_decimal128(column, stream)); + d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128( + column, stream, rmm::mr::get_current_device_resource())); // Update metadata metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION); metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()})); // Create a new column view from the d128 data vector return {data_type{type_id::DECIMAL128, column.type().scale()}, column.size(), - d128_vectors.back().data(), + d128_buffers.back()->data(), column.null_mask(), column.null_count(), column.offset(), @@ -1722,6 +1687,9 @@ std::vector convert_decimal_columns_and_metadata( std::back_inserter(converted_column_views), [&](auto elem) { return convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); }); + // Synchronize stream here to ensure all decimal128 buffers are ready. + stream.synchronize(); + return converted_column_views; } @@ -1780,13 +1748,13 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, rmm::cuda_stream_view stream) { // Container to store decimal128 converted data if needed - std::vector> d128_vectors; + std::vector> d128_buffers; // Convert decimal32/decimal64 data to decimal128 if writing arrow schema // and initialize LinkedColVector auto vec = table_to_linked_columns( (write_arrow_schema) - ? table_view({convert_decimal_columns_and_metadata(table_meta, d128_vectors, input, stream)}) + ? 
table_view({convert_decimal_columns_and_metadata(table_meta, d128_buffers, input, stream)}) : input); auto schema_tree = construct_parquet_schema_tree( @@ -1824,7 +1792,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type max_page_fragment_size = max_page_fragment_size_opt.value_or(default_max_page_fragment_size); - std::vector column_frag_size(num_columns, max_page_fragment_size); + auto column_frag_size = cudf::detail::make_host_vector(num_columns, stream); + std::fill(column_frag_size.begin(), column_frag_size.end(), max_page_fragment_size); if (input.num_rows() > 0 && not max_page_fragment_size_opt.has_value()) { std::vector column_sizes; @@ -1880,7 +1849,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type num_fragments = std::reduce(num_frag_in_part.begin(), num_frag_in_part.end()); - std::vector part_frag_offset; // Store the idx of the first fragment in each partition + auto part_frag_offset = + cudf::detail::make_empty_host_vector(num_frag_in_part.size() + 1, stream); + // Store the idx of the first fragment in each partition std::exclusive_scan( num_frag_in_part.begin(), num_frag_in_part.end(), std::back_inserter(part_frag_offset), 0); part_frag_offset.push_back(part_frag_offset.back() + num_frag_in_part.back()); diff --git a/cpp/src/io/utilities/base64_utilities.hpp b/cpp/src/io/utilities/base64_utilities.hpp index 537d9c96d6b..b1eb120c47f 100644 --- a/cpp/src/io/utilities/base64_utilities.hpp +++ b/cpp/src/io/utilities/base64_utilities.hpp @@ -61,10 +61,13 @@ // altered: applying clang-format for libcudf on this file. // altered: include required headers +#include + #include // altered: use cudf namespaces -namespace cudf::io::detail { +namespace CUDF_EXPORT cudf { +namespace io::detail { /** * @brief Encodes input string to base64 and returns it @@ -84,4 +87,5 @@ std::string base64_encode(std::string_view string_to_encode); */ std::string base64_decode(std::string_view encoded_string); -} // namespace cudf::io::detail +} // namespace io::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index aa1b29a101f..73362334e26 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -20,11 +20,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include #include @@ -933,7 +933,7 @@ std::unique_ptr parse_data( auto d_null_count = rmm::device_scalar(null_count, stream); auto null_count_data = d_null_count.data(); if (null_mask.is_empty()) { - null_mask = cudf::detail::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr); + null_mask = cudf::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr); } // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 9fe5959436d..d7b54399f8d 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -223,7 +223,6 @@ cufile_input_impl::cufile_input_impl(std::string const& filepath) // The benefit from multithreaded read plateaus around 16 threads pool(getenv_or("LIBCUDF_CUFILE_THREAD_COUNT", 16)) { - pool.sleep_duration = 10; } namespace { @@ -232,14 +231,15 @@ template > std::vector> make_sliced_tasks( - F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool) + F function, DataT* ptr, size_t 
offset, size_t size, BS::thread_pool& pool) { constexpr size_t default_max_slice_size = 4 * 1024 * 1024; static auto const max_slice_size = getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_size); auto const slices = make_file_io_slices(size, max_slice_size); std::vector> slice_tasks; std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) { - return pool.submit(function, ptr + slice.offset, slice.size, offset + slice.offset); + return pool.submit_task( + [&] { return function(ptr + slice.offset, slice.size, offset + slice.offset); }); }); return slice_tasks; } diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 91ef41fba6e..7e47b5b3d10 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -19,13 +19,13 @@ #ifdef CUFILE_FOUND #include -#include - +#include #include #endif #include #include +#include #include @@ -150,7 +150,7 @@ class cufile_input_impl final : public cufile_input { private: cufile_shim const* shim = nullptr; cufile_registered_file const cf_file; - cudf::detail::thread_pool pool; + BS::thread_pool pool; }; /** @@ -167,7 +167,7 @@ class cufile_output_impl final : public cufile_output { private: cufile_shim const* shim = nullptr; cufile_registered_file const cf_file; - cudf::detail::thread_pool pool; + BS::thread_pool pool; }; #else @@ -212,7 +212,7 @@ std::unique_ptr make_cufile_output(std::string const& filepa /** * @brief Byte range to be read/written in a single operation. */ -struct file_io_slice { +CUDF_EXPORT struct file_io_slice { size_t offset; size_t size; }; @@ -222,7 +222,7 @@ struct file_io_slice { * * If `max_slice_size` is below 1024, 1024 will be used instead to prevent potential misuse. 
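
// For context on the slicing described above: each read or write is split into
// fixed-size slices (4 MiB by default, configurable via LIBCUDF_CUFILE_SLICE_SIZE)
// and each slice becomes one BS::thread_pool task. A rough standalone sketch of
// the slicing math only; io_slice and slice_reads are illustrative names, not
// the cudf internals.

#include <algorithm>
#include <cstddef>
#include <vector>

struct io_slice {
  std::size_t offset;
  std::size_t size;
};

std::vector<io_slice> slice_reads(std::size_t total_size, std::size_t max_slice_size)
{
  if (max_slice_size < 1024) { max_slice_size = 1024; }  // floor to prevent misuse
  std::vector<io_slice> slices;
  for (std::size_t off = 0; off < total_size; off += max_slice_size) {
    slices.push_back({off, std::min(max_slice_size, total_size - off)});
  }
  return slices;
}

// e.g. slice_reads(10u << 20, 4u << 20) yields slices of 4 MiB, 4 MiB, and 2 MiB.
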
*/ -std::vector make_file_io_slices(size_t size, size_t max_slice_size); +CUDF_EXPORT std::vector make_file_io_slices(size_t size, size_t max_slice_size); } // namespace detail } // namespace io diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp index 7fdcc65d77b..7c607099cdc 100644 --- a/cpp/src/io/utilities/row_selection.hpp +++ b/cpp/src/io/utilities/row_selection.hpp @@ -21,7 +21,8 @@ #include #include -namespace cudf::io::detail { +namespace CUDF_EXPORT cudf { +namespace io::detail { /** * @brief Adjusts the input skip_rows and num_rows options to the actual number of rows to @@ -38,4 +39,5 @@ std::pair skip_rows_num_rows_from_options(int64_t skip_rows, std::optional const& num_rows, int64_t num_source_rows); -} // namespace cudf::io::detail +} // namespace io::detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp index 3e6f57f2896..0d9e7e40e4e 100644 --- a/cpp/src/io/utilities/string_parsing.hpp +++ b/cpp/src/io/utilities/string_parsing.hpp @@ -18,6 +18,7 @@ #include "io/utilities/parsing_utils.cuh" #include +#include #include #include @@ -43,7 +44,7 @@ namespace detail { * @param stream CUDA stream used for device memory operations and kernel launches * @return The inferred data type */ -cudf::data_type infer_data_type( +CUDF_EXPORT cudf::data_type infer_data_type( cudf::io::json_inference_options_view const& options, device_span data, thrust::zip_iterator> offset_length_begin, @@ -66,7 +67,7 @@ namespace json::detail { * @param mr The resource to be used for device memory allocation * @return The column that contains the parsed data */ -std::unique_ptr parse_data( +CUDF_EXPORT std::unique_ptr parse_data( char const* data, thrust::zip_iterator> offset_length_begin, size_type col_size, diff --git a/cpp/src/io/utilities/trie.cuh b/cpp/src/io/utilities/trie.cuh index 677743d77d0..caea8dabb88 100644 --- a/cpp/src/io/utilities/trie.cuh +++ b/cpp/src/io/utilities/trie.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2018-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #pragma once +#include #include #include @@ -67,7 +68,8 @@ inline trie_view make_trie_view(optional_trie const& t) * * @return A host vector of nodes representing the serialized trie */ -trie create_serialized_trie(std::vector const& keys, rmm::cuda_stream_view stream); +CUDF_EXPORT trie create_serialized_trie(std::vector const& keys, + rmm::cuda_stream_view stream); /* * @brief Searches for a string in a serialized trie. 
diff --git a/cpp/src/jit/parser.hpp b/cpp/src/jit/parser.hpp index 55528bed6cf..85c8d63192f 100644 --- a/cpp/src/jit/parser.hpp +++ b/cpp/src/jit/parser.hpp @@ -16,12 +16,14 @@ #pragma once +#include + #include #include #include #include -namespace cudf { +namespace CUDF_EXPORT cudf { namespace jit { /** * @brief Parse and transform a piece of PTX code that contains the implementation @@ -239,4 +241,4 @@ inline std::string parse_single_function_ptx(std::string const& src, std::string parse_single_function_cuda(std::string const& src, std::string const& function_name); } // namespace jit -} // namespace cudf +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index b0184ff6a86..eb9b687630b 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -185,6 +185,8 @@ probe_join_hash_table( auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); + cudf::experimental::prefetch::detail::prefetch("hash_join", *left_indices, stream); + cudf::experimental::prefetch::detail::prefetch("hash_join", *right_indices, stream); auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls}; diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index f03d394d6d7..30c03a8cd68 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index 3d609a262b9..8cd58e7eff2 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index 779eca438db..90f7994b21d 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index 5625e1bf05c..50f40924478 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -257,10 +257,8 @@ dremel_data get_encoding(column_view h_col, }, stream); - thrust::host_vector column_offsets = - cudf::detail::make_host_vector_async(d_column_offsets, stream); - thrust::host_vector column_ends = - cudf::detail::make_host_vector_async(d_column_ends, stream); + auto column_offsets = cudf::detail::make_host_vector_async(d_column_offsets, stream); + auto column_ends = cudf::detail::make_host_vector_async(d_column_ends, stream); stream.synchronize(); size_t max_vals_size = 0; diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu index 370d7480578..46c4fc78a6f 100644 --- a/cpp/src/lists/explode.cu +++ b/cpp/src/lists/explode.cu @@ -229,8 +229,8 @@ std::unique_ptr
explode_outer(table_view const& input_table, if (null_or_empty_count == 0) { // performance penalty to run the below loop if there are no nulls or empty lists. // run simple explode instead - return include_position ? explode_position(input_table, explode_column_idx, stream, mr) - : explode(input_table, explode_column_idx, stream, mr); + return include_position ? detail::explode_position(input_table, explode_column_idx, stream, mr) + : detail::explode(input_table, explode_column_idx, stream, mr); } auto gather_map_size = sliced_child.size() + null_or_empty_count; @@ -300,58 +300,63 @@ std::unique_ptr
explode_outer(table_view const& input_table, } // namespace detail /** - * @copydoc cudf::explode(table_view const&, size_type, rmm::device_async_resource_ref) + * @copydoc cudf::explode(table_view const&, size_type, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) */ std::unique_ptr
explode(table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, "Unsupported non-list column"); - return detail::explode(input_table, explode_column_idx, cudf::get_default_stream(), mr); + return detail::explode(input_table, explode_column_idx, stream, mr); } /** - * @copydoc cudf::explode_position(table_view const&, size_type, rmm::device_async_resource_ref) + * @copydoc cudf::explode_position(table_view const&, size_type, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) */ std::unique_ptr
explode_position(table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, "Unsupported non-list column"); - return detail::explode_position(input_table, explode_column_idx, cudf::get_default_stream(), mr); + return detail::explode_position(input_table, explode_column_idx, stream, mr); } /** - * @copydoc cudf::explode_outer(table_view const&, size_type, rmm::device_async_resource_ref) + * @copydoc cudf::explode_outer(table_view const&, size_type, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) */ std::unique_ptr
explode_outer(table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, "Unsupported non-list column"); - return detail::explode_outer( - input_table, explode_column_idx, false, cudf::get_default_stream(), mr); + return detail::explode_outer(input_table, explode_column_idx, false, stream, mr); } /** * @copydoc cudf::explode_outer_position(table_view const&, size_type, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) */ std::unique_ptr
explode_outer_position(table_view const& input_table, size_type explode_column_idx, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, "Unsupported non-list column"); - return detail::explode_outer( - input_table, explode_column_idx, true, cudf::get_default_stream(), mr); + return detail::explode_outer(input_table, explode_column_idx, true, stream, mr); } } // namespace cudf diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index 1d18b8c677c..5c7ab68d64b 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu index 40dee010bd5..cdcb4aa957f 100644 --- a/cpp/src/lists/stream_compaction/distinct.cu +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 7ecaa0fba56..e2c8d49a4ab 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index 82b169c78ed..9810373b751 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -271,8 +272,8 @@ std::pair, std::vector> round_robin_part std::pair, std::vector> round_robin_partition( table_view const& input, cudf::size_type num_partitions, - cudf::size_type start_partition = 0, - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) + cudf::size_type start_partition, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::round_robin_partition( diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index b25254cfe49..5d748de0019 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index af3bda2e62e..0b0e6701304 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index da36b7ab1da..421ed26e26d 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index 2c1181972c5..6cb58786971 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -107,8 +107,7 @@ rmm::device_scalar reduce_device(InputIterator d_in, * respectively of the minimums and maximums of the input pairs. 
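
// Background for the minmax_binary_op change just below: thrust::binary_function
// only supplied member typedefs and is deprecated in recent Thrust releases, so
// reduction functors simply drop the base class. A minimal sketch of the pattern
// with illustrative types, not the cudf functor:

#include <algorithm>

struct min_max_pair {
  int min_val;
  int max_val;
};

struct combine_min_max {  // no thrust::binary_function base required
  min_max_pair operator()(min_max_pair const& lhs, min_max_pair const& rhs) const
  {
    return {std::min(lhs.min_val, rhs.min_val), std::max(lhs.max_val, rhs.max_val)};
  }
};
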
*/ template -struct minmax_binary_op - : public thrust::binary_function, minmax_pair, minmax_pair> { +struct minmax_binary_op { __device__ minmax_pair operator()(minmax_pair const& lhs, minmax_pair const& rhs) const { return minmax_pair{thrust::min(lhs.min_val, rhs.min_val), diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu index 0befb6ac7d7..0dbfc271a25 100644 --- a/cpp/src/reductions/scan/rank_scan.cu +++ b/cpp/src/reductions/scan/rank_scan.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index 7c02a8d1b99..ee35d716d6e 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index 48ab5963a29..e6de065dabb 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -13,11 +13,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #include #include #include #include +#include #include #include #include diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 3dfa0b65814..2a03a5504c1 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -167,11 +167,6 @@ struct byte_list_conversion_fn byte_cast(column_view const& input, flip_endianness endian_configuration, rmm::cuda_stream_view stream, @@ -183,15 +178,13 @@ std::unique_ptr byte_cast(column_view const& input, } // namespace detail -/** - * @copydoc cudf::byte_cast(column_view const&, flip_endianness, rmm::device_async_resource_ref) - */ std::unique_ptr byte_cast(column_view const& input, flip_endianness endian_configuration, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::byte_cast(input, endian_configuration, cudf::get_default_stream(), mr); + return detail::byte_cast(input, endian_configuration, stream, mr); } } // namespace cudf diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index 580db0e24c5..7473b6045af 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -263,10 +264,11 @@ std::unique_ptr interleave_columns(table_view const& input, } // namespace detail std::unique_ptr interleave_columns(table_view const& input, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::interleave_columns(input, cudf::get_default_stream(), mr); + return detail::interleave_columns(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu index 1c4019b2c73..3d4fb73c000 100644 --- a/cpp/src/reshape/tile.cu +++ b/cpp/src/reshape/tile.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -63,10 +64,11 @@ std::unique_ptr
tile(table_view const& in, std::unique_ptr
tile(table_view const& in, size_type count, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::tile(in, count, cudf::get_default_stream(), mr); + return detail::tile(in, count, stream, mr); } } // namespace cudf diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index e612bd01118..5dff40a3396 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index 07425a92413..83209c55c8a 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -216,7 +216,7 @@ template class fixed_point_scalar; template class fixed_point_scalar; template class fixed_point_scalar; -namespace detail { +namespace CUDF_HIDDEN detail { template fixed_width_scalar::fixed_width_scalar(T value, @@ -306,7 +306,7 @@ template class fixed_width_scalar; template class fixed_width_scalar; template class fixed_width_scalar; -} // namespace detail +} // namespace CUDF_HIDDEN detail template numeric_scalar::numeric_scalar(T value, diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu index 8f05196a71c..57f2c59de40 100644 --- a/cpp/src/search/contains_column.cu +++ b/cpp/src/search/contains_column.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/search/contains_scalar.cu b/cpp/src/search/contains_scalar.cu index e88acf68e28..2aa9e24174b 100644 --- a/cpp/src/search/contains_scalar.cu +++ b/cpp/src/search/contains_scalar.cu @@ -17,10 +17,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index 4fb983dc5a6..81227cb9a2d 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -18,6 +18,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/search/search_ordered.cu b/cpp/src/search/search_ordered.cu index 328d3f0cee4..80651a4ec44 100644 --- a/cpp/src/search/search_ordered.cu +++ b/cpp/src/search/search_ordered.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index c4cc0dbe09d..b534e9b2e5b 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -169,8 +169,10 @@ std::unique_ptr join_strings(strings_column_view const& input, // build the offsets: single string output has offsets [0,chars-size] auto offsets_column = [&] { - auto offsets = cudf::detail::make_device_uvector_async( - std::vector({0, static_cast(chars.size())}), stream, mr); + auto h_offsets = cudf::detail::make_host_vector(2, stream); + h_offsets[0] = 0; + h_offsets[1] = chars.size(); + auto offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr); return std::make_unique(std::move(offsets), rmm::device_buffer{}, 0); }(); diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 2f4ebf97264..64a2107e17a 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -123,7 +123,7 @@ struct format_compiler { : format(fmt), d_items(0, stream) { specifiers.insert(extra_specifiers.begin(), extra_specifiers.end()); - std::vector items; + auto items = cudf::detail::make_empty_host_vector(format.length(), stream); auto str = 
format.data(); auto length = format.length(); while (length > 0) { diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 2e4a776d3c0..514ab965fc5 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 7622e39e735..352e0f9f41a 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -79,7 +79,7 @@ auto create_strings_device_views(host_span views, rmm::cuda_s // Compute the partition offsets and size of offset column // Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type - auto input_offsets = std::vector(views.size() + 1); + auto input_offsets = cudf::detail::make_host_vector(views.size() + 1, stream); auto offset_it = std::next(input_offsets.begin()); thrust::transform( thrust::host, views.begin(), views.end(), offset_it, [](auto const& col) -> size_t { diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index a34828fa97e..48620af8cad 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -129,7 +129,7 @@ std::unique_ptr filter_characters( // convert input table for copy to device memory size_type table_size = static_cast(characters_to_filter.size()); - thrust::host_vector htable(table_size); + auto htable = cudf::detail::make_host_vector(table_size, stream); std::transform( characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) { return char_range{entry.first, entry.second}; diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index cd60a4296b9..31234ea42ec 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -171,7 +171,7 @@ std::unique_ptr replace_re(strings_column_view const& input, auto d_buffer = rmm::device_buffer(buffer_size, stream); // copy all the reprog_device instances to a device memory array - std::vector progs; + auto progs = cudf::detail::make_empty_host_vector(h_progs.size(), stream); std::transform(h_progs.begin(), h_progs.end(), std::back_inserter(progs), diff --git a/cpp/src/strings/strings_scalar_factories.cpp b/cpp/src/strings/strings_scalar_factories.cpp index 233fee14694..cf973638cc4 100644 --- a/cpp/src/strings/strings_scalar_factories.cpp +++ b/cpp/src/strings/strings_scalar_factories.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 16b22d0de4c..a242b008a54 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -97,7 +97,7 @@ std::unique_ptr translate(strings_column_view const& strings, size_type table_size = static_cast(chars_table.size()); // convert input table - thrust::host_vector htable(table_size); + auto htable = cudf::detail::make_host_vector(table_size, stream); std::transform(chars_table.begin(), chars_table.end(), htable.begin(), [](auto entry) { return translate_table{entry.first, entry.second}; }); diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index f70598f33be..068d89a52dc 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
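
// The pattern recurring throughout this patch: host-side staging buffers move
// from std::vector / thrust::host_vector to cudf::detail::host_vector, whose
// allocation can come from pinned memory, which makes the later host-to-device
// copy faster. A sketch of typical usage, assuming only the factory signatures
// as they appear in this patch (declared in cudf/detail/utilities/vector_factories.hpp):

#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

void stage_and_copy(int n, rmm::cuda_stream_view stream)
{
  // Reserve capacity for n elements up front; the size starts at zero
  auto staged = cudf::detail::make_empty_host_vector<int>(n, stream);
  for (int i = 0; i < n; ++i) {
    staged.push_back(i * i);
  }
  // Asynchronous H2D copy into an rmm::device_uvector<int> ordered on `stream`
  auto d_staged = cudf::detail::make_device_uvector_async(
    staged, stream, rmm::mr::get_current_device_resource());
}
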
*/ + #include "strings/char_types/char_cases.h" #include "strings/char_types/char_flags.h" diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 13c31e8ae4c..2969557c78f 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -308,7 +308,10 @@ auto decompose_structs(table_view table, auto list_lex_preprocess(table_view const& table, rmm::cuda_stream_view stream) { std::vector dremel_data; - std::vector dremel_device_views; + auto const num_list_columns = std::count_if( + table.begin(), table.end(), [](auto const& col) { return col.type().id() == type_id::LIST; }); + auto dremel_device_views = + cudf::detail::make_empty_host_vector(num_list_columns, stream); for (auto const& col : table) { if (col.type().id() == type_id::LIST) { dremel_data.push_back(detail::get_comparator_data(col, {}, false, stream)); diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu index 723c306da1d..808f2d1b284 100644 --- a/cpp/src/transform/one_hot_encode.cu +++ b/cpp/src/transform/one_hot_encode.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index bfac7ab586e..12a15eb7e34 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 3d0822d8545..0efb881eb3e 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -14,6 +14,9 @@ * limitations under the License. */ +#include "cudf/detail/utilities/integer_utils.hpp" + +#include #include #include #include @@ -26,15 +29,24 @@ namespace cudf::detail { namespace { +// Simple kernel to copy between device buffers +CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx < n) { dst[idx] = src[idx]; } +} + void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) { if (size == 0) return; if (size < get_kernel_pinned_copy_threshold()) { - thrust::copy_n(rmm::exec_policy_nosync(stream), - static_cast(src), - size, - static_cast(dst)); + const int block_size = 256; + auto const grid_size = cudf::util::div_rounding_up_safe(size, block_size); + // We are explicitly launching the kernel here instead of calling a thrust function because the + // thrust function can potentially call cudaMemcpyAsync instead of using a kernel + copy_kernel<<>>( + static_cast(src), static_cast(dst), size); } else { CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); } diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/host_memory.cpp similarity index 73% rename from cpp/src/utilities/pinned_memory.cpp rename to cpp/src/utilities/host_memory.cpp index 3ea4293fc60..7c3cea42023 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -83,7 +83,7 @@ class fixed_pinned_pool_memory_resource { void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, - cuda::stream_ref stream) noexcept + cuda::stream_ref stream) { if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) { pool_->deallocate_async(ptr, bytes, alignment, stream); @@ -92,14 +92,14 @@ class fixed_pinned_pool_memory_resource { } } - void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref 
stream) noexcept + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) { return deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); } void deallocate(void* ptr, std::size_t bytes, - std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { deallocate_async(ptr, bytes, alignment, stream_); stream_.wait(); @@ -186,6 +186,61 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr() return mr_ref; } +class new_delete_memory_resource { + public: + void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) + { + try { + return rmm::detail::aligned_host_allocate( + bytes, alignment, [](std::size_t size) { return ::operator new(size); }); + } catch (std::bad_alloc const& e) { + CUDF_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory); + } + } + + void* allocate_async(std::size_t bytes, [[maybe_unused]] cuda::stream_ref stream) + { + return allocate(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); + } + + void* allocate_async(std::size_t bytes, + std::size_t alignment, + [[maybe_unused]] cuda::stream_ref stream) + { + return allocate(bytes, alignment); + } + + void deallocate(void* ptr, + std::size_t bytes, + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) + { + rmm::detail::aligned_host_deallocate( + ptr, bytes, alignment, [](void* ptr) { ::operator delete(ptr); }); + } + + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + [[maybe_unused]] cuda::stream_ref stream) + { + deallocate(ptr, bytes, alignment); + } + + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) + { + deallocate(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); + } + + bool operator==(new_delete_memory_resource const& other) const { return true; } + + bool operator!=(new_delete_memory_resource const& other) const { return !operator==(other); } + + friend void get_property(new_delete_memory_resource const&, cuda::mr::host_accessible) noexcept {} +}; + +static_assert(cuda::mr::resource_with, + "Pageable pool mr must be accessible from the host"); + } // namespace rmm::host_device_async_resource_ref set_pinned_memory_resource( @@ -225,4 +280,29 @@ void set_kernel_pinned_copy_threshold(size_t threshold) size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); } +CUDF_EXPORT auto& allocate_host_as_pinned_threshold() +{ + // use pageable memory for all host allocations + static std::atomic threshold = 0; + return threshold; +} + +void set_allocate_host_as_pinned_threshold(size_t threshold) +{ + allocate_host_as_pinned_threshold() = threshold; +} + +size_t get_allocate_host_as_pinned_threshold() { return allocate_host_as_pinned_threshold(); } + +namespace detail { + +CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource() +{ + static new_delete_memory_resource mr{}; + static rmm::host_async_resource_ref mr_ref{mr}; + return mr_ref; +} + +} // namespace detail + } // namespace cudf diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp new file mode 100644 index 00000000000..86d6cc00764 --- /dev/null +++ b/cpp/src/utilities/prefetch.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +#include + +namespace cudf::experimental::prefetch { + +namespace detail { + +prefetch_config& prefetch_config::instance() +{ + static prefetch_config instance; + return instance; +} + +bool prefetch_config::get(std::string_view key) +{ + std::shared_lock lock(config_mtx); + auto const it = config_values.find(key.data()); + return it == config_values.end() ? false : it->second; // default to not prefetching +} + +void prefetch_config::set(std::string_view key, bool value) +{ + std::lock_guard lock(config_mtx); + config_values[key.data()] = value; +} + +cudaError_t prefetch_noexcept(std::string_view key, + void const* ptr, + std::size_t size, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id) noexcept +{ + if (prefetch_config::instance().get(key)) { + if (prefetch_config::instance().debug) { + std::cerr << "Prefetching " << size << " bytes for key " << key << " at location " << ptr + << std::endl; + } + auto result = cudaMemPrefetchAsync(ptr, size, device_id.value(), stream.value()); + // Need to flush the CUDA error so that the context is not corrupted. + if (result == cudaErrorInvalidValue) { cudaGetLastError(); } + return result; + } + return cudaSuccess; +} + +void prefetch(std::string_view key, + void const* ptr, + std::size_t size, + rmm::cuda_stream_view stream, + rmm::cuda_device_id device_id) +{ + auto result = prefetch_noexcept(key, ptr, size, stream, device_id); + // Ignore cudaErrorInvalidValue because that will be raised if prefetching is + // attempted on unmanaged memory. 
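+ // As a usage sketch (illustrative only, not part of this patch): callers opt
+ // in per key through the public API defined at the bottom of this file, e.g.
+ //
+ //   cudf::experimental::prefetch::enable_prefetching("column_view::get_data");
+ //   ... run cudf operations; pointers passed to detail::prefetch() under this
+ //   key are then prefetched with cudaMemPrefetchAsync ...
+ //   cudf::experimental::prefetch::disable_prefetching("column_view::get_data");
+ //
+ // The key string above is hypothetical; keys are whatever call sites pass in.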
+ if ((result != cudaErrorInvalidValue) && (result != cudaSuccess)) { + std::cerr << "Prefetch failed" << std::endl; + CUDF_CUDA_TRY(result); + } +} + +} // namespace detail + +void enable_prefetching(std::string_view key) +{ + detail::prefetch_config::instance().set(key, true); +} + +void disable_prefetching(std::string_view key) +{ + detail::prefetch_config::instance().set(key, false); +} + +void prefetch_debugging(bool enable) { detail::prefetch_config::instance().debug = enable; } +} // namespace cudf::experimental::prefetch diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8e2017ccb97..4dffcb41ba2 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -270,6 +270,7 @@ ConfigureTest( INTEROP_TEST interop/to_arrow_device_test.cpp interop/to_arrow_test.cpp + interop/to_arrow_host_test.cpp interop/from_arrow_test.cpp interop/from_arrow_device_test.cpp interop/from_arrow_host_test.cpp @@ -313,17 +314,17 @@ ConfigureTest( PERCENT 30 ) ConfigureTest( - JSON_TEST io/json_test.cpp io/json_chunked_reader.cpp + JSON_TEST io/json/json_test.cpp io/json/json_chunked_reader.cu GPUS 1 PERCENT 30 ) -ConfigureTest(JSON_WRITER_TEST io/json_writer.cpp) -ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu) -ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp) +ConfigureTest(JSON_WRITER_TEST io/json/json_writer.cpp) +ConfigureTest(JSON_TYPE_CAST_TEST io/json/json_type_cast_test.cu) +ConfigureTest(NESTED_JSON_TEST io/json/nested_json_test.cpp io/json/json_tree.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) -ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp) -ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json_whitespace_normalization_test.cu) +ConfigureTest(JSON_QUOTE_NORMALIZATION io/json/json_quote_normalization_test.cpp) +ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json/json_whitespace_normalization_test.cu) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 @@ -572,7 +573,7 @@ ConfigureTest( LARGE_STRINGS_TEST large_strings/concatenate_tests.cpp large_strings/case_tests.cpp - large_strings/json_tests.cpp + large_strings/json_tests.cu large_strings/large_strings_fixture.cpp large_strings/merge_tests.cpp large_strings/parquet_tests.cpp @@ -688,7 +689,10 @@ ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE tes ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) +# Deprecation from 16297 and fixes in 16379 caused this test to be empty. This will be reenabled once +# the deprecated APIs have been replaced in 24.10.
+# +# ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) @@ -700,6 +704,7 @@ ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE test ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/binaryop/binop-verify-input-test.cpp b/cpp/tests/binaryop/binop-verify-input-test.cpp index 1346dcd4666..def6e94452e 100644 --- a/cpp/tests/binaryop/binop-verify-input-test.cpp +++ b/cpp/tests/binaryop/binop-verify-input-test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Copyright 2018-2019 BlazingDB, Inc. * Copyright 2018 Christian Noboa Mardini @@ -42,5 +42,5 @@ TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize) EXPECT_THROW(cudf::binary_operation( lhs, rhs, cudf::binary_operator::ADD, cudf::data_type(cudf::type_id::INT64)), - cudf::logic_error); + std::invalid_argument); } diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 6eaa1a07e08..733e5814425 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -14,6 +14,13 @@ * limitations under the License. */ +// These interop functions are deprecated. We keep the code in this +// test and will migrate the tests to export via the arrow C data +// interface, which we consume with from_arrow_host. For now, the tests +// are commented out.
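+// A migration sketch (illustrative only, not part of this patch): the
+// replacement path hands cudf an ArrowDeviceArray whose device_type is
+// ARROW_DEVICE_CPU, roughly
+//
+//   ArrowSchema schema;          // produced by the exporting library
+//   ArrowDeviceArray host_data;  // .array filled in, .device_type = ARROW_DEVICE_CPU
+//   auto tbl = cudf::from_arrow_host(&schema, &host_data);
+//
+// Variable names here are ours; see from_arrow_host_test.cpp for real usage.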
+ +#if 0 + #include #include @@ -595,3 +602,5 @@ TEST_F(FromArrowStructScalarTest, Basic) CUDF_TEST_EXPECT_TABLES_EQUAL(lhs, cudf_struct_scalar->view()); } + +#endif diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 4147728b2a6..a961f73d955 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -29,6 +28,7 @@ #include #include +#include struct generated_test_data { generated_test_data(cudf::size_type length) @@ -211,6 +211,7 @@ DEFINE_NANOARROW_STORAGE(cudf::duration_us, INT64); DEFINE_NANOARROW_STORAGE(cudf::duration_ns, INT64); DEFINE_NANOARROW_STORAGE(uint8_t, UINT8); DEFINE_NANOARROW_STORAGE(int32_t, INT32); +DEFINE_NANOARROW_STORAGE(__int128_t, DECIMAL128); #undef DEFINE_NANOARROW_STORAGE @@ -255,8 +256,7 @@ std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_ ArrowBitmap out; ArrowBitmapInit(&out); NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1)); - out.buffer.size_bytes = (b.size() >> 3) + ((b.size() & 7) != 0); - out.size_bits = b.size(); + std::memset(out.buffer.data, 0, out.buffer.size_bytes); for (size_t i = 0; i < b.size(); ++i) { ArrowBitSetTo(out.buffer.data, i, static_cast(b[i])); @@ -296,6 +296,7 @@ std::enable_if_t, nanoarrow::UniqueArray> g { nanoarrow::UniqueArray tmp; NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(tmp.get()), mask.size())); NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(tmp.get())); NANOARROW_THROW_NOT_OK(ArrowArrayReserve(tmp.get(), data.size())); @@ -378,3 +379,5 @@ get_nanoarrow_cudf_table(cudf::size_type length); std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> get_nanoarrow_host_tables(cudf::size_type length); + +void slice_host_nanoarrow(ArrowArray* arr, int64_t start, int64_t end); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 8903f09b82b..51216a8512c 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -711,6 +710,83 @@ TEST_F(ToArrowDeviceTest, StructColumn) template using fp_wrapper = cudf::test::fixed_point_column_wrapper; +TEST_F(ToArrowDeviceTest, FixedPoint32Table) +{ + using namespace numeric; + + for (auto const scale : {6, 4, 2, 0, -1, -3, -5}) { + auto const expect_data = + std::vector{-1000, -1, -1, -1, 2400, 0, 0, 0, -3456, -1, -1, -1, + 4650, 0, 0, 0, 5154, 0, 0, 0, 6800, 0, 0, 0}; + auto col = fp_wrapper({-1000, 2400, -3456, 4650, 5154, 6800}, scale_type{scale}); + std::vector> table_cols; + table_cols.emplace_back(col.release()); + auto input = cudf::table(std::move(table_cols)); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + auto got_arrow_schema = + cudf::to_arrow_schema(input.view(), std::vector{{"a"}}); + compare_schemas(expected_schema.get(), got_arrow_schema.get()); + + auto result_dev_data = 
std::make_unique>( + expect_data.size(), cudf::get_default_stream()); + cudaMemcpy(result_dev_data->data(), + expect_data.data(), + sizeof(int32_t) * expect_data.size(), + cudaMemcpyHostToDevice); + + cudf::get_default_stream().synchronize(); + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + expected_array->children[0]->length = input.num_rows(); + NANOARROW_THROW_NOT_OK( + ArrowBufferSetAllocator(ArrowArrayBuffer(expected_array->children[0], 0), noop_alloc)); + ArrowArrayValidityBitmap(expected_array->children[0])->buffer.data = + const_cast(reinterpret_cast(input.view().column(0).null_mask())); + + auto data_ptr = reinterpret_cast(result_dev_data->data()); + NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator( + ArrowArrayBuffer(expected_array->children[0], 1), + ArrowBufferDeallocator( + [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) { + auto buf = + reinterpret_cast>*>(alloc->private_data); + delete buf; + }, + new std::unique_ptr>(std::move(result_dev_data))))); + ArrowArrayBuffer(expected_array->children[0], 1)->data = data_ptr; + NANOARROW_THROW_NOT_OK( + ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr)); + + auto got_arrow_array = cudf::to_arrow_device(input.view()); + ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + + got_arrow_array = cudf::to_arrow_device(std::move(input)); + ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id); + ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type); + ASSERT_CUDA_SUCCEEDED( + cudaEventSynchronize(*reinterpret_cast(got_arrow_array->sync_event))); + compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array); + } +} + TEST_F(ToArrowDeviceTest, FixedPoint64Table) { using namespace numeric; diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp new file mode 100644 index 00000000000..fc0ed6c9352 --- /dev/null +++ b/cpp/tests/interop/to_arrow_host_test.cpp @@ -0,0 +1,1117 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nanoarrow_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +using vector_of_columns = std::vector>; + +struct BaseToArrowHostFixture : public cudf::test::BaseFixture { + template + std::enable_if_t() and !std::is_same_v, void> compare_subset( + ArrowArrayView const* expected, + int64_t start_offset_expected, + ArrowArrayView const* actual, + int64_t start_offset_actual, + int64_t length) + { + for (int64_t i = 0; i < length; ++i) { + const bool is_null = ArrowArrayViewIsNull(expected, start_offset_expected + i); + EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, start_offset_actual + i)); + if (is_null) continue; + + const auto expected_val = ArrowArrayViewGetIntUnsafe(expected, start_offset_expected + i); + const auto actual_val = ArrowArrayViewGetIntUnsafe(actual, start_offset_actual + i); + + EXPECT_EQ(expected_val, actual_val); + } + } + + template + std::enable_if_t, void> compare_subset( + ArrowArrayView const* expected, + int64_t start_offset_expected, + ArrowArrayView const* actual, + int64_t start_offset_actual, + int64_t length) + { + for (int64_t i = 0; i < length; ++i) { + const bool is_null = ArrowArrayViewIsNull(expected, start_offset_expected + i); + EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, start_offset_actual + i)); + if (is_null) continue; + + const auto expected_view = ArrowArrayViewGetBytesUnsafe(expected, start_offset_expected + i); + const auto actual_view = ArrowArrayViewGetBytesUnsafe(actual, start_offset_actual + i); + + EXPECT_EQ(expected_view.size_bytes, actual_view.size_bytes); + EXPECT_TRUE( + 0 == std::memcmp(expected_view.data.data, actual_view.data.data, expected_view.size_bytes)); + } + } + + void compare_child_subset(ArrowArrayView const* expected, + int64_t exp_start_offset, + ArrowArrayView const* actual, + int64_t act_start_offset, + int64_t length) + { + EXPECT_EQ(expected->storage_type, actual->storage_type); + EXPECT_EQ(expected->n_children, actual->n_children); + + switch (expected->storage_type) { + case NANOARROW_TYPE_LIST: + for (int64_t i = 0; i < length; ++i) { + const auto expected_start = exp_start_offset + i; + const auto actual_start = act_start_offset + i; + + // ArrowArrayViewIsNull accounts for the array offset, so we can properly + // compare the validity of indexes + const bool is_null = ArrowArrayViewIsNull(expected, expected_start); + EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, actual_start)); + if (is_null) continue; + + // ArrowArrayViewListChildOffset does not account for array offset, so we need + // to add the offset to the index in order to get the correct offset into the list + const int64_t start_offset_expected = + ArrowArrayViewListChildOffset(expected, expected->offset + expected_start); + const int64_t start_offset_actual = + ArrowArrayViewListChildOffset(actual, actual->offset + actual_start); + + const int64_t end_offset_expected = + ArrowArrayViewListChildOffset(expected, expected->offset + expected_start + 1); + const int64_t end_offset_actual = + ArrowArrayViewListChildOffset(actual, actual->offset + actual_start + 1); + + // verify the list lengths are the same + EXPECT_EQ(end_offset_expected - start_offset_expected, + end_offset_actual - start_offset_actual); + // compare the list values + compare_child_subset(expected->children[0], + start_offset_expected, + actual->children[0], + start_offset_actual, + 
end_offset_expected - start_offset_expected); + } + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < length; ++i) { + SCOPED_TRACE("idx: " + std::to_string(i)); + const auto expected_start = exp_start_offset + i; + const auto actual_start = act_start_offset + i; + + const bool is_null = ArrowArrayViewIsNull(expected, expected_start); + EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, actual_start)); + if (is_null) continue; + + for (int64_t child = 0; child < expected->n_children; ++child) { + SCOPED_TRACE("child: " + std::to_string(child)); + compare_child_subset(expected->children[child], + expected_start + expected->offset, + actual->children[child], + actual_start + actual->offset, + 1); + } + } + break; + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + compare_subset( + expected, exp_start_offset, actual, act_start_offset, length); + break; + default: + compare_subset(expected, exp_start_offset, actual, act_start_offset, length); + break; + } + } + + void compare_arrays(ArrowArrayView const* expected, ArrowArrayView const* actual) + { + EXPECT_EQ(expected->length, actual->length); + EXPECT_EQ(expected->null_count, actual->null_count); + EXPECT_EQ(expected->offset, actual->offset); + EXPECT_EQ(expected->n_children, actual->n_children); + EXPECT_EQ(expected->storage_type, actual->storage_type); + + // cudf automatically pushes down nulls and purges non-empty, non-zero nulls + // from the children columns. So while we can memcmp the buffers for top + // level arrays, we need to do an "equivalence" comparison for nested + // arrays (lists and structs) by checking each index for null and skipping + // comparisons for children if null. + switch (expected->storage_type) { + case NANOARROW_TYPE_STRUCT: + // if we're a struct with no children, then we just skip + // attempting to compare the children + if (expected->n_children == 0) { + EXPECT_EQ(nullptr, actual->children); + break; + } + // otherwise we can fallthrough and do the same thing we do for lists + case NANOARROW_TYPE_LIST: + compare_child_subset(expected, 0, actual, 0, expected->length); + break; + default: + for (int64_t i = 0; i < actual->array->n_buffers; ++i) { + SCOPED_TRACE("buffer " + std::to_string(i)); + auto expected_buf = expected->buffer_views[i]; + auto actual_buf = actual->buffer_views[i]; + + EXPECT_TRUE(0 == std::memcmp(expected_buf.data.data, + actual_buf.data.data, + expected_buf.size_bytes)); + } + } + + if (expected->dictionary != nullptr) { + EXPECT_NE(nullptr, actual->dictionary); + SCOPED_TRACE("dictionary"); + compare_arrays(expected->dictionary, actual->dictionary); + } else { + EXPECT_EQ(nullptr, actual->dictionary); + } + } +}; + +struct ToArrowHostDeviceTest : public BaseToArrowHostFixture {}; +template +struct ToArrowHostDeviceTestDurationsTest : public BaseToArrowHostFixture {}; + +TYPED_TEST_SUITE(ToArrowHostDeviceTestDurationsTest, cudf::test::DurationTypes); + +TEST_F(ToArrowHostDeviceTest, EmptyTable) +{ + auto [tbl, schema, arr] = get_nanoarrow_host_tables(0); + + auto got_arrow_host = cudf::to_arrow_host(tbl->view()); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, arr.get(), nullptr)); + + 
NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + + ArrowArrayViewReset(&expected); + ArrowArrayViewReset(&actual); +} + +TEST_F(ToArrowHostDeviceTest, DateTimeTable) +{ + auto data = std::initializer_list{1, 2, 3, 4, 5, 6}; + auto col = + cudf::test::fixed_width_column_wrapper(data); + cudf::table_view input_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime( + expected_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + auto got_arrow_host = cudf::to_arrow_host(input_view); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + expected.length = data.size(); + expected.children[0]->length = data.size(); + ArrowArrayViewSetLength(expected.children[0], data.size()); + expected.children[0]->buffer_views[0].data.data = nullptr; + expected.children[0]->buffer_views[0].size_bytes = 0; + expected.children[0]->buffer_views[1].data.data = data.begin(); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input_view.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + BaseToArrowHostFixture::compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + ArrowArrayViewReset(&actual); +} + +TYPED_TEST(ToArrowHostDeviceTestDurationsTest, DurationTable) +{ + using T = TypeParam; + + if (cudf::type_to_id() == cudf::type_id::DURATION_DAYS) { return; } + + auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}}; + auto col = cudf::test::fixed_width_column_wrapper(data); + + cudf::table_view input_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + + ArrowSchemaInit(expected_schema->children[0]); + const ArrowTimeUnit arrow_unit = [&] { + switch (cudf::type_to_id()) { + case cudf::type_id::DURATION_SECONDS: return NANOARROW_TIME_UNIT_SECOND; + case cudf::type_id::DURATION_MILLISECONDS: return NANOARROW_TIME_UNIT_MILLI; + case cudf::type_id::DURATION_MICROSECONDS: return NANOARROW_TIME_UNIT_MICRO; + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TIME_UNIT_NANO; + default: CUDF_FAIL("Unsupported duration unit in arrow"); + } + }(); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime( + expected_schema->children[0], NANOARROW_TYPE_DURATION, arrow_unit, nullptr)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + 
expected_schema->children[0]->flags = 0; + + auto got_arrow_host = cudf::to_arrow_host(input_view); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + + expected.length = data.size(); + expected.children[0]->length = data.size(); + ArrowArrayViewSetLength(expected.children[0], data.size()); + expected.children[0]->buffer_views[0].data.data = nullptr; + expected.children[0]->buffer_views[0].size_bytes = 0; + expected.children[0]->buffer_views[1].data.data = data.begin(); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + BaseToArrowHostFixture::compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input_view.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + BaseToArrowHostFixture::compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); +} + +TEST_F(ToArrowHostDeviceTest, NestedList) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; }); + auto col = cudf::test::lists_column_wrapper( + {{{{{1, 2}, valids}, {{3, 4}, valids}, {5}}, {{6}, {{7, 8, 9}, valids}}}, valids}); + cudf::table_view input_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(expected_schema->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = ARROW_FLAG_NULLABLE; + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(expected_schema->children[0]->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0]->children[0], "element")); + expected_schema->children[0]->children[0]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType( + expected_schema->children[0]->children[0]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetName(expected_schema->children[0]->children[0]->children[0], "element")); + expected_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE; + + auto got_arrow_host = cudf::to_arrow_host(input_view); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + auto list_arr = get_nanoarrow_list_array({6, 7, 8, 9}, {0, 1, 4}, {1, 0, 1, 1}); + std::vector offset{0, 0, 2}; + + ArrowBitmap mask; + ArrowBitmapInit(&mask); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 2)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 1)); + + nanoarrow::UniqueArray expected_arr; + EXPECT_EQ(NANOARROW_OK, + ArrowArrayInitFromSchema(expected_arr.get(), expected_schema.get(), nullptr)); + expected_arr->length = input_view.num_rows(); + expected_arr->null_count = 0; + + 
ArrowArraySetValidityBitmap(expected_arr->children[0], &mask); + expected_arr->children[0]->length = input_view.num_rows(); + expected_arr->children[0]->null_count = 1; + auto offset_buf = ArrowArrayBuffer(expected_arr->children[0], 1); + EXPECT_EQ( + NANOARROW_OK, + ArrowBufferAppend( + offset_buf, reinterpret_cast(offset.data()), offset.size() * sizeof(int32_t))); + list_arr.move(expected_arr->children[0]->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_arr.get(), nullptr)); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_arr.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input_view.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); +} + +TEST_F(ToArrowHostDeviceTest, StructColumn) +{ + // Create cudf table + auto nested_type_field_names = + std::vector>{{"string", "integral", "bool", "nested_list", "struct"}}; + auto str_col = + cudf::test::strings_column_wrapper{ + "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"} + .release(); + auto str_col2 = + cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release(); + int num_rows{str_col->size()}; + auto int_col = cudf::test::fixed_width_column_wrapper{{48, 27, 25}}.release(); + auto int_col2 = + cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); + auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); + auto list_col = + cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) + .release(); + vector_of_columns cols2; + cols2.push_back(std::move(str_col2)); + cols2.push_back(std::move(int_col2)); + auto [null_mask, null_count] = + cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper{{true, true, false}}); + auto sub_struct_col = + cudf::make_structs_column(num_rows, std::move(cols2), null_count, std::move(*null_mask)); + vector_of_columns cols; + cols.push_back(std::move(str_col)); + cols.push_back(std::move(int_col)); + cols.push_back(std::move(bool_col)); + cols.push_back(std::move(list_col)); + cols.push_back(std::move(sub_struct_col)); + + auto struct_col = cudf::make_structs_column(num_rows, std::move(cols), 0, {}); + cudf::table_view input_view({struct_col->view()}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema->children[0], 5)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + auto child = expected_schema->children[0]; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[0], 
"string")); + child->children[0]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[1], "integral")); + child->children[1]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[2], "bool")); + child->children[2]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3], "nested_list")); + child->children[3]->flags = 0; + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3]->children[0], "element")); + child->children[3]->children[0]->flags = 0; + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element")); + child->children[3]->children[0]->children[0]->flags = 0; + + ArrowSchemaInit(child->children[4]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(child->children[4], 2)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4], "struct")); + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[0], "string2")); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[1], "integral2")); + + // create nanoarrow table + // first our underlying arrays + std::vector str{"Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"}; + std::vector str2{"CUDF", "ROCKS", "EVERYWHERE"}; + auto str_array = get_nanoarrow_array(str); + auto int_array = get_nanoarrow_array({48, 27, 25}); + auto str2_array = get_nanoarrow_array(str2, {0, 1, 0}); + // struct null will get pushed down and superimposed on this array + auto int2_array = get_nanoarrow_array({12, 24, 47}, {1, 0, 0}); + auto bool_array = get_nanoarrow_array({true, true, false}); + auto list_arr = + get_nanoarrow_list_array({1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 2, 4, 5, 6, 7, 9}); + std::vector offset{0, 3, 4, 6}; + + nanoarrow::UniqueArray expected_arr; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_arr.get(), expected_schema.get(), nullptr)); + expected_arr->length = input_view.num_rows(); + + auto array_a = expected_arr->children[0]; + auto view_a = input_view.column(0); + array_a->length = view_a.size(); + array_a->null_count = view_a.null_count(); + + str_array.move(array_a->children[0]); + int_array.move(array_a->children[1]); + bool_array.move(array_a->children[2]); + + array_a->children[3]->length = input_view.num_rows(); + array_a->children[3]->null_count = 0; + + auto offset_buf = ArrowArrayBuffer(array_a->children[3], 1); + EXPECT_EQ( + NANOARROW_OK, + ArrowBufferAppend( + offset_buf, reinterpret_cast(offset.data()), offset.size() * sizeof(int32_t))); + list_arr.move(array_a->children[3]->children[0]); + + ArrowBitmap mask; + ArrowBitmapInit(&mask); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 3)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 2)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1)); + + auto array_struct = 
array_a->children[4]; + auto view_struct = view_a.child(4); + ArrowArraySetValidityBitmap(array_struct, &mask); + array_struct->null_count = view_struct.null_count(); + array_struct->length = view_struct.size(); + + str2_array.move(array_struct->children[0]); + int2_array.move(array_struct->children[1]); + + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_arr.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input_view); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_arr.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input_view.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); +} + +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TEST_F(ToArrowHostDeviceTest, FixedPoint32Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto const data = std::vector<__int128_t>{-1, 2, 3, 4, 5, 6}; + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, 
expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint64Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto const data = std::vector<__int128_t>{-1, 2, 3, 4, 5, 6}; + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint128Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper<__int128_t>({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto const data = std::vector<__int128_t>{-1, 2, 3, 4, 5, 6}; + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision<__int128_t>(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + 
get_nanoarrow_array<__int128_t>(data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint32TableLarge) +{ + using namespace numeric; + auto constexpr NUM_ELEMENTS = 1000; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto expect_data = std::vector<__int128_t>(NUM_ELEMENTS); + std::iota(expect_data.begin(), expect_data.end(), 1); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(expect_data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + 
compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint64TableLarge) +{ + using namespace numeric; + auto constexpr NUM_ELEMENTS = 1000; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto expect_data = std::vector<__int128_t>(NUM_ELEMENTS); + std::iota(expect_data.begin(), expect_data.end(), 1); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(expect_data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint128TableLarge) +{ + using namespace numeric; + auto constexpr NUM_ELEMENTS = 1000; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper<__int128_t>(iota, iota + NUM_ELEMENTS, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto expect_data = std::vector<__int128_t>(NUM_ELEMENTS); + std::iota(expect_data.begin(), expect_data.end(), 1); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision<__int128_t>(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + 
nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(expect_data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint32TableNullsSimple) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0}; + auto const validity = std::vector{1, 1, 1, 1, 1, 1, 0, 0}; + auto const col = + fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(data, validity).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + 
+      ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(expected.children[0], &actual);
+    ArrowArrayViewReset(&actual);
+
+    ArrowArrayViewReset(&expected);
+  }
+}
+
+TEST_F(ToArrowHostDeviceTest, FixedPoint64TableNullsSimple)
+{
+  using namespace numeric;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto const data     = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0};
+    auto const validity = std::vector<uint8_t>{1, 1, 1, 1, 1, 1, 0, 0};
+    auto const col =
+      fp_wrapper<int64_t>({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale});
+    auto const input = cudf::table_view({col});
+
+    nanoarrow::UniqueSchema expected_schema;
+    ArrowSchemaInit(expected_schema.get());
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+    ArrowSchemaInit(expected_schema->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<int64_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+    expected_schema->children[0]->flags = 0;
+
+    nanoarrow::UniqueArray expected_array;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
+    expected_array->length = input.num_rows();
+
+    get_nanoarrow_array<__int128_t>(data, validity).move(expected_array->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr));
+
+    auto got_arrow_host = cudf::to_arrow_host(input);
+    EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+    EXPECT_EQ(-1, got_arrow_host->device_id);
+    EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+    ArrowArrayView expected, actual;
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr));
+
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(&expected, &actual);
+    ArrowArrayViewReset(&actual);
+
+    got_arrow_host = cudf::to_arrow_host(input.column(0));
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(expected.children[0], &actual);
+    ArrowArrayViewReset(&actual);
+
+    ArrowArrayViewReset(&expected);
+  }
+}
+
+TEST_F(ToArrowHostDeviceTest, FixedPoint128TableNullsSimple)
+{
+  using namespace numeric;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto const data     = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0};
+    auto const validity = std::vector<uint8_t>{1, 1, 1, 1, 1, 1, 0, 0};
+    auto const col =
+      fp_wrapper<__int128_t>({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale});
+    auto const input = cudf::table_view({col});
+
+    nanoarrow::UniqueSchema expected_schema;
+    ArrowSchemaInit(expected_schema.get());
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+    ArrowSchemaInit(expected_schema->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<__int128_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+    expected_schema->children[0]->flags = 0;
+
+    nanoarrow::UniqueArray expected_array;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
+    expected_array->length = input.num_rows();
+
+    get_nanoarrow_array<__int128_t>(data, validity).move(expected_array->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr));
+
+    auto got_arrow_host = cudf::to_arrow_host(input);
+    EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+    EXPECT_EQ(-1, got_arrow_host->device_id);
+    EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+    ArrowArrayView expected, actual;
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr));
+
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(&expected, &actual);
+    ArrowArrayViewReset(&actual);
+
+    got_arrow_host = cudf::to_arrow_host(input.column(0));
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(expected.children[0], &actual);
+    ArrowArrayViewReset(&actual);
+
+    ArrowArrayViewReset(&expected);
+  }
+}
+
+struct ToArrowHostDeviceTestSlice
+  : public ToArrowHostDeviceTest,
+    public ::testing::WithParamInterface<std::tuple<cudf::size_type, cudf::size_type>> {};
+
+TEST_P(ToArrowHostDeviceTestSlice, SliceTest)
+{
+  auto [table, expected_schema, expected_array] = get_nanoarrow_host_tables(10000);
+  auto cudf_table_view                          = table->view();
+  auto const [start, end]                       = GetParam();
+
+  slice_host_nanoarrow(expected_array.get(), start, end);
+  auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0];
+  auto got_arrow_host    = cudf::to_arrow_host(sliced_cudf_table);
+  EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+  EXPECT_EQ(-1, got_arrow_host->device_id);
+  EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+  ArrowArrayView expected, actual;
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr));
+
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+  compare_arrays(&expected, &actual);
+  ArrowArrayViewReset(&actual);
+
+  ArrowArrayViewReset(&expected);
+}
+
+INSTANTIATE_TEST_CASE_P(ToArrowHostDeviceTest,
+                        ToArrowHostDeviceTestSlice,
+                        ::testing::Values(std::make_tuple(0, 10000),
+                                          std::make_tuple(100, 3000),
+                                          std::make_tuple(0, 0),
+                                          std::make_tuple(0, 3000)));
diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp
index a1ece0ce0f1..328ba210a3f 100644
--- a/cpp/tests/interop/to_arrow_test.cpp
+++ b/cpp/tests/interop/to_arrow_test.cpp
@@ -14,6 +14,13 @@
  * limitations under the License.
  */
 
+// These interop functions are deprecated. We keep the code in this
+// test and will migrate the tests to export via the arrow C data
+// interface with to_arrow_host, which arrow can consume. For now, the
+// test is commented out.
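+//
+// A minimal sketch of the migrated shape, reusing only calls already
+// exercised in to_arrow_host_device_test.cpp above (illustrative, not the
+// final test code):
+//
+//   auto got = cudf::to_arrow_host(input);  // ArrowDeviceArray on the CPU
+//   ArrowArrayView actual;
+//   NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, schema.get(), nullptr));
+//   NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got->array, nullptr));
+//   compare_arrays(&expected, &actual);  // expected view built with nanoarrow as above
+//   ArrowArrayViewReset(&actual);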
+ +#if 0 + #include #include @@ -196,6 +203,7 @@ TEST_F(ToArrowTest, DateTimeTable) std::vector> schema_vector({arrow::field("a", arr->type())}); auto schema = std::make_shared(schema_vector); + auto expected_arrow_table = arrow::Table::Make(schema, {arr}); auto got_arrow_table = cudf::to_arrow(input_view, {{"a"}}); @@ -685,3 +693,5 @@ TEST_F(ToArrowStructScalarTest, Basic) } CUDF_TEST_PROGRAM_MAIN() + +#endif diff --git a/cpp/tests/io/fst/common.hpp b/cpp/tests/io/fst/common.hpp index 382d21fabb8..0177300eda9 100644 --- a/cpp/tests/io/fst/common.hpp +++ b/cpp/tests/io/fst/common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,6 +69,8 @@ std::array, TT_NUM_STATES> const pda_s /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; // Translation table (i.e., for each transition, what are the symbols that we output) +static constexpr auto min_translated_out = 1; +static constexpr auto max_translated_out = 1; std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const pda_out_tt{ {/* IN_STATE { [ } ] " \ OTHER */ /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}}, diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 4df0d3ae04d..8a8d3d39e0f 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -169,7 +169,9 @@ TEST_F(FstTest, GroundTruth) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); // Allocate device-side temporary storage & run algorithm diff --git a/cpp/tests/io/json_chunked_reader.cpp b/cpp/tests/io/json/json_chunked_reader.cu similarity index 64% rename from cpp/tests/io/json_chunked_reader.cpp rename to cpp/tests/io/json/json_chunked_reader.cu index 23d54f7263c..b9dee54752c 100644 --- a/cpp/tests/io/json_chunked_reader.cpp +++ b/cpp/tests/io/json/json_chunked_reader.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "io/json/read_json.hpp" +#include "json_utils.cuh" #include #include @@ -37,65 +37,6 @@ cudf::test::TempDirTestEnvironment* const temp_env = static_cast( ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); -// function to extract first delimiter in the string in each chunk, -// collate together and form byte_range for each chunk, -// parse separately. -std::vector skeleton_for_parellel_chunk_reader( - cudf::host_span> sources, - cudf::io::json_reader_options const& reader_opts, - int32_t chunk_size, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using namespace cudf::io::json::detail; - using cudf::size_type; - size_t total_source_size = 0; - for (auto const& source : sources) { - total_source_size += source->size(); - } - size_t num_chunks = (total_source_size + chunk_size - 1) / chunk_size; - constexpr size_type no_min_value = -1; - - // Get the first delimiter in each chunk. 
- std::vector first_delimiter_index(num_chunks); - auto reader_opts_chunk = reader_opts; - for (size_t i = 0; i < num_chunks; i++) { - auto const chunk_start = i * chunk_size; - reader_opts_chunk.set_byte_range_offset(chunk_start); - reader_opts_chunk.set_byte_range_size(chunk_size); - first_delimiter_index[i] = - find_first_delimiter_in_chunk(sources, reader_opts_chunk, '\n', stream); - if (first_delimiter_index[i] != no_min_value) { first_delimiter_index[i] += chunk_start; } - } - - // Process and allocate record start, end for each worker. - using record_range = std::pair; - std::vector record_ranges; - record_ranges.reserve(num_chunks); - first_delimiter_index[0] = 0; - auto prev = first_delimiter_index[0]; - for (size_t i = 1; i < num_chunks; i++) { - if (first_delimiter_index[i] == no_min_value) continue; - record_ranges.emplace_back(prev, first_delimiter_index[i]); - prev = first_delimiter_index[i]; - } - record_ranges.emplace_back(prev, total_source_size); - - std::vector tables; - // Process each chunk in parallel. - for (auto const& [chunk_start, chunk_end] : record_ranges) { - if (chunk_start == -1 or chunk_end == -1 or - static_cast(chunk_start) >= total_source_size) - continue; - reader_opts_chunk.set_byte_range_offset(chunk_start); - reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start); - tables.push_back(read_json(sources, reader_opts_chunk, stream, mr)); - } - // assume all records have same number of columns, and inferred same type. (or schema is passed) - // TODO a step before to merge all columns, types and infer final schema. - return tables; -} - TEST_F(JsonReaderTest, ByteRange_SingleSource) { std::string const json_string = R"( @@ -118,11 +59,11 @@ TEST_F(JsonReaderTest, ByteRange_SingleSource) // Test for different chunk sizes for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500}) { - auto const tables = skeleton_for_parellel_chunk_reader(datasources, - json_lines_options, - chunk_size, - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); + auto const tables = split_byte_range_reading(datasources, + json_lines_options, + chunk_size, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto table_views = std::vector(tables.size()); std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) { @@ -213,11 +154,11 @@ TEST_F(JsonReaderTest, ByteRange_MultiSource) // Test for different chunk sizes for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500, 1000, 2000}) { - auto const tables = skeleton_for_parellel_chunk_reader(datasources, - json_lines_options, - chunk_size, - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); + auto const tables = split_byte_range_reading(datasources, + json_lines_options, + chunk_size, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto table_views = std::vector(tables.size()); std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) { diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp similarity index 100% rename from cpp/tests/io/json_quote_normalization_test.cpp rename to cpp/tests/io/json/json_quote_normalization_test.cpp diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json/json_test.cpp similarity index 100% rename from cpp/tests/io/json_test.cpp rename to cpp/tests/io/json/json_test.cpp diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp similarity index 99% rename from 
cpp/tests/io/json_tree.cpp
rename to cpp/tests/io/json/json_tree.cpp
index 7a72b77e1fb..8bcd5790e99 100644
--- a/cpp/tests/io/json_tree.cpp
+++ b/cpp/tests/io/json/json_tree.cpp
@@ -235,10 +235,8 @@ tree_meta_t2 get_tree_representation_cpu(
 {
   constexpr bool include_quote_char = true;
   // Copy the JSON tokens to the host
-  thrust::host_vector<PdaTokenT> tokens =
-    cudf::detail::make_host_vector_async(tokens_gpu, stream);
-  thrust::host_vector<SymbolOffsetT> token_indices =
-    cudf::detail::make_host_vector_async(token_indices_gpu1, stream);
+  auto tokens        = cudf::detail::make_host_vector_async(tokens_gpu, stream);
+  auto token_indices = cudf::detail::make_host_vector_async(token_indices_gpu1, stream);
 
   // Make sure tokens have been copied to the host
   stream.synchronize();
diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json/json_type_cast_test.cu
similarity index 100%
rename from cpp/tests/io/json_type_cast_test.cu
rename to cpp/tests/io/json/json_type_cast_test.cu
diff --git a/cpp/tests/io/json/json_utils.cuh b/cpp/tests/io/json/json_utils.cuh
new file mode 100644
index 00000000000..9383797d91b
--- /dev/null
+++ b/cpp/tests/io/json/json_utils.cuh
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "io/json/read_json.hpp"
+
+#include <cudf/io/datasource.hpp>
+#include <cudf/io/json.hpp>
+#include <cudf/io/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <thrust/find.h>
+
+// Helper function to test correctness of JSON byte range reading.
+// We split the input source files into a set of byte range chunks, each of size
+// `chunk_size`, and return an array of partial tables constructed from each chunk.
+template <typename IndexType>
+std::vector<cudf::io::table_with_metadata> split_byte_range_reading(
+  cudf::host_span<std::unique_ptr<cudf::io::datasource>> sources,
+  cudf::io::json_reader_options const& reader_opts,
+  IndexType chunk_size,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  auto total_source_size = [&sources]() {
+    return std::accumulate(sources.begin(), sources.end(), 0ul, [=](size_t sum, auto& source) {
+      auto const size = source->size();
+      return sum + size;
+    });
+  }();
+  auto find_first_delimiter_in_chunk =
+    [total_source_size, &sources, &stream](
+      cudf::io::json_reader_options const& reader_opts) -> IndexType {
+    rmm::device_uvector<char> buffer(total_source_size, stream);
+    auto readbufspan = cudf::io::json::detail::ingest_raw_input(buffer,
+                                                                sources,
+                                                                reader_opts.get_compression(),
+                                                                reader_opts.get_byte_range_offset(),
+                                                                reader_opts.get_byte_range_size(),
+                                                                stream);
+    // Note: we cannot reuse cudf::io::json::detail::find_first_delimiter since the
+    // return type of that function is size_type. However, when the chunk_size is
+    // larger than INT_MAX, the position of the delimiter can also be larger than
+    // INT_MAX. We do not encounter this overflow error in the detail function
+    // since the batched JSON reader splits the byte_range_size into chunk_sizes
+    // smaller than INT_MAX bytes.
+    auto const first_delimiter_position_it =
+      thrust::find(rmm::exec_policy(stream), readbufspan.begin(), readbufspan.end(), '\n');
+    return first_delimiter_position_it != readbufspan.end()
+             ? thrust::distance(readbufspan.begin(), first_delimiter_position_it)
+             : -1;
+  };
+  size_t num_chunks                = (total_source_size + chunk_size - 1) / chunk_size;
+  constexpr IndexType no_min_value = -1;
+
+  // Get the first delimiter in each chunk.
+  std::vector<IndexType> first_delimiter_index(num_chunks);
+  auto reader_opts_chunk = reader_opts;
+  for (size_t i = 0; i < num_chunks; i++) {
+    auto const chunk_start = i * chunk_size;
+    // We update reader_opts_chunk to store the offset and size of the current chunk
+    reader_opts_chunk.set_byte_range_offset(chunk_start);
+    reader_opts_chunk.set_byte_range_size(chunk_size);
+    first_delimiter_index[i] = find_first_delimiter_in_chunk(reader_opts_chunk);
+  }
+
+  // Process and allocate record start, end for each worker.
+  using record_range = std::pair<size_t, size_t>;
+  std::vector<record_range> record_ranges;
+  record_ranges.reserve(num_chunks);
+  size_t prev = 0;
+  for (size_t i = 1; i < num_chunks; i++) {
+    // In the case where chunk_size is smaller than the row size, the chunk needs to be skipped
+    if (first_delimiter_index[i] == no_min_value) continue;
+    size_t next = static_cast<size_t>(first_delimiter_index[i]) + (i * chunk_size);
+    record_ranges.emplace_back(prev, next);
+    prev = next;
+  }
+  record_ranges.emplace_back(prev, total_source_size);
+
+  std::vector<cudf::io::table_with_metadata> tables;
+  for (auto const& [chunk_start, chunk_end] : record_ranges) {
+    reader_opts_chunk.set_byte_range_offset(chunk_start);
+    reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start);
+    tables.push_back(cudf::io::json::detail::read_json(sources, reader_opts_chunk, stream, mr));
+  }
+  // assume all records have the same number of columns and infer the same types (or a schema is passed)
+  // TODO a step before to merge all columns, types and infer final schema.
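+  // The caller is expected to stitch the partial tables back together, e.g.
+  // (as the byte-range tests in json_chunked_reader.cu do):
+  //   std::vector<cudf::table_view> views(tables.size());
+  //   std::transform(tables.begin(), tables.end(), views.begin(),
+  //                  [](auto& table) { return table.tbl->view(); });
+  //   auto result = cudf::concatenate(views);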
+ return tables; +} diff --git a/cpp/tests/io/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu similarity index 100% rename from cpp/tests/io/json_whitespace_normalization_test.cu rename to cpp/tests/io/json/json_whitespace_normalization_test.cu diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp similarity index 100% rename from cpp/tests/io/json_writer.cpp rename to cpp/tests/io/json/json_writer.cpp diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp similarity index 100% rename from cpp/tests/io/nested_json_test.cpp rename to cpp/tests/io/json/nested_json_test.cpp diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu index cff85647725..66b36aeed63 100644 --- a/cpp/tests/io/parquet_chunked_reader_test.cu +++ b/cpp/tests/io/parquet_chunked_reader_test.cu @@ -149,6 +149,33 @@ auto chunked_read(std::string const& filepath, return chunked_read(vpath, output_limit, input_limit); } +auto const read_table_and_nrows_per_source(cudf::io::chunked_parquet_reader const& reader) +{ + auto out_tables = std::vector>{}; + int num_chunks = 0; + auto nrows_per_source = std::vector{}; + while (reader.has_next()) { + auto chunk = reader.read_chunk(); + out_tables.emplace_back(std::move(chunk.tbl)); + num_chunks++; + if (nrows_per_source.empty()) { + nrows_per_source = std::move(chunk.metadata.num_rows_per_source); + } else { + std::transform(chunk.metadata.num_rows_per_source.cbegin(), + chunk.metadata.num_rows_per_source.cend(), + nrows_per_source.begin(), + nrows_per_source.begin(), + std::plus()); + } + } + auto out_tviews = std::vector{}; + for (auto const& tbl : out_tables) { + out_tviews.emplace_back(tbl->view()); + } + + return std::tuple(cudf::concatenate(out_tviews), num_chunks, nrows_per_source); +} + } // namespace struct ParquetChunkedReaderTest : public cudf::test::BaseFixture {}; @@ -1477,3 +1504,370 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadOutOfBoundChunks) CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result); } } + +TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource) +{ + constexpr int num_rows = 10'723; // A prime number + constexpr int rows_in_row_group = 500; + + // Table with single col of random int64 values + auto const int64_data = random_values(num_rows); + auto int64_col = int64s_col(int64_data.begin(), int64_data.end()).release(); + + std::vector> input_columns; + input_columns.emplace_back(std::move(int64_col)); + + // Write to Parquet + auto const [expected, filepath] = write_file(input_columns, + "num_rows_per_source", + false, + false, + cudf::io::default_max_page_size_bytes, + rows_in_row_group); + + // Chunked-read single data source entirely + { + auto constexpr output_read_limit = 1'500; + auto constexpr pass_read_limit = 3'500; + + auto const options = + cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath}).build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result->view()); + EXPECT_EQ(num_rows_per_source.size(), 1); + EXPECT_EQ(num_rows_per_source[0], num_rows); + } + + // Chunked-read rows_to_read rows skipping rows_to_skip from single data source + { + // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved 
+ auto const rows_to_skip = 0; // 1'237 + auto const rows_to_read = 7'232; + auto constexpr output_read_limit = 1'500; + auto constexpr pass_read_limit = 3'500; + + auto const options = cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath}) + .skip_rows(rows_to_skip) + .num_rows(rows_to_read) + .build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + auto int64_col_selected = int64s_col(int64_data.begin() + rows_to_skip, + int64_data.begin() + rows_to_skip + rows_to_read) + .release(); + + cudf::table_view const expected_selected({int64_col_selected->view()}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view()); + EXPECT_EQ(num_rows_per_source.size(), 1); + EXPECT_EQ(num_rows_per_source[0], rows_to_read); + } + + // Chunked-read two data sources skipping the first entire file completely + { + // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved + auto constexpr rows_to_skip = 0; // 15'723; + auto constexpr output_read_limit = 1'024'000; + auto constexpr pass_read_limit = 1'024'000; + + auto constexpr nsources = 2; + std::vector const datasources(nsources, filepath); + + auto const options = + cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources}) + .skip_rows(rows_to_skip) + .build(); + + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + // TODO: Enable code inside /* */ when https://github.com/rapidsai/cudf/issues/16186 is resolved + auto int64_col_selected = + int64s_col(int64_data.begin() /* + rows_to_skip - num_rows */, int64_data.end()).release(); + + cudf::table_view const expected_selected({int64_col_selected->view()}); + + // TODO: Enable the following check when https://github.com/rapidsai/cudf/issues/16186 + // is resolved + // CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view()); + + EXPECT_EQ(num_rows_per_source.size(), 2); + EXPECT_EQ(num_rows_per_source[0], num_rows /* 0 */); + EXPECT_EQ(num_rows_per_source[1], num_rows /* nsources * num_rows - rows_to_skip */); + } + + // Chunked-read from single data source skipping rows_to_skip + { + // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved + auto const rows_to_skip = 0; // 1'237; + auto constexpr output_read_limit = 1'500; + auto constexpr pass_read_limit = 1'800; + + auto const options = cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath}) + .skip_rows(rows_to_skip) + .build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + auto int64_col_selected = + int64s_col(int64_data.begin() + rows_to_skip, int64_data.end()).release(); + + cudf::table_view const expected_selected({int64_col_selected->view()}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view()); + EXPECT_EQ(num_rows_per_source.size(), 1); + EXPECT_EQ(num_rows_per_source[0], num_rows - rows_to_skip); + } + + // Filtered chunked-read from single data source + { + int64_t const max_value = 
int64_data[int64_data.size() / 2]; + auto constexpr output_read_limit = 1'500; + auto constexpr pass_read_limit = 3'500; + auto literal_value = cudf::numeric_scalar{max_value}; + auto literal = cudf::ast::literal{literal_value}; + auto col_ref = cudf::ast::column_reference(0); + auto filter_expression = + cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_ref, literal); + + auto const options = cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath}) + .filter(filter_expression) + .build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + std::vector int64_data_filtered; + int64_data_filtered.reserve(num_rows); + std::copy_if( + int64_data.begin(), int64_data.end(), std::back_inserter(int64_data_filtered), [=](auto val) { + return val <= max_value; + }); + + auto int64_col_filtered = + int64s_col(int64_data_filtered.begin(), int64_data_filtered.end()).release(); + + cudf::table_view expected_filtered({int64_col_filtered->view()}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_filtered, result->view()); + EXPECT_TRUE(num_rows_per_source.empty()); + } +} + +TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceMultipleSources) +{ + constexpr int num_rows = 10'723; // A prime number + constexpr int rows_in_row_group = 500; + + // Table with single col of random int64 values + auto const int64_data = random_values(num_rows); + auto int64_col = int64s_col(int64_data.begin(), int64_data.end()).release(); + + std::vector> input_columns; + input_columns.emplace_back(std::move(int64_col)); + + // Write to Parquet + auto const [expected, filepath] = write_file(input_columns, + "num_rows_per_source", + false, + false, + cudf::io::default_max_page_size_bytes, + rows_in_row_group); + + // Function to initialize a vector of expected counts per source + auto initialize_expected_counts = + [](int const nsources, int const num_rows, int const rows_to_skip, int const rows_to_read) { + // Initialize expected_counts + std::vector expected_counts(nsources, num_rows); + + // Adjust expected_counts for rows_to_skip + int64_t counter = 0; + for (auto& nrows : expected_counts) { + if (counter < rows_to_skip) { + counter += nrows; + nrows = (counter >= rows_to_skip) ? counter - rows_to_skip : 0; + } else { + break; + } + } + + // Reset the counter + counter = 0; + + // Adjust expected_counts for rows_to_read + for (auto& nrows : expected_counts) { + if (counter < rows_to_read) { + counter += nrows; + nrows = (counter >= rows_to_read) ? 
rows_to_read - counter + nrows : nrows; + } else if (counter > rows_to_read) { + nrows = 0; + } + } + + return expected_counts; + }; + + // Chunked-read six data sources entirely + { + auto const nsources = 6; + auto constexpr output_read_limit = 15'000; + auto constexpr pass_read_limit = 35'000; + std::vector const datasources(nsources, filepath); + + auto const options = + cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources}).build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + // Initialize expected_counts + std::vector const expected_counts(nsources, num_rows); + + EXPECT_EQ(num_rows_per_source.size(), nsources); + EXPECT_TRUE( + std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin())); + } + + // Chunked-read rows_to_read rows skipping rows_to_skip from eight data sources + { + // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved + auto const rows_to_skip = 0; // 25'571; + auto const rows_to_read = 41'232; + auto constexpr output_read_limit = 15'000; + auto constexpr pass_read_limit = 35'000; + auto const nsources = 8; + std::vector int64_selected_data{}; + int64_selected_data.reserve(nsources * num_rows); + + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(nsources), + [&](auto const i) { + std::copy(int64_data.begin(), int64_data.end(), std::back_inserter(int64_selected_data)); + }); + + std::vector const datasources(nsources, filepath); + + auto const options = + cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources}) + .skip_rows(rows_to_skip) + .num_rows(rows_to_read) + .build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + // Initialize expected_counts + auto const expected_counts = + initialize_expected_counts(nsources, num_rows, rows_to_skip, rows_to_read); + + // Initialize expected table + auto int64_col_selected = int64s_col(int64_selected_data.begin() + rows_to_skip, + int64_selected_data.begin() + +rows_to_skip + rows_to_read) + .release(); + + cudf::table_view const expected_selected({int64_col_selected->view()}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view()); + EXPECT_EQ(num_rows_per_source.size(), nsources); + EXPECT_TRUE( + std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin())); + } + + // Chunked-read four data sources skipping three files completely + { + auto const nsources = 4; + // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved + int constexpr rows_to_skip = 0; // num_rows * 3 + 1; + auto constexpr output_read_limit = 15'000; + auto constexpr pass_read_limit = 35'000; + std::vector int64_selected_data{}; + int64_selected_data.reserve(nsources * num_rows); + + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(nsources), + [&](auto const i) { + std::copy(int64_data.begin(), int64_data.end(), std::back_inserter(int64_selected_data)); + }); + + std::vector const datasources(nsources, filepath); + auto const options = + 
cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources}) + .skip_rows(rows_to_skip) + .build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + // Initialize expected_counts + auto const expected_counts = + initialize_expected_counts(nsources, num_rows, rows_to_skip, num_rows * nsources); + + // Initialize expected table + auto int64_col_selected = + int64s_col(int64_selected_data.begin() + rows_to_skip, int64_selected_data.end()).release(); + + cudf::table_view const expected_selected({int64_col_selected->view()}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view()); + EXPECT_EQ(num_rows_per_source.size(), nsources); + EXPECT_TRUE( + std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin())); + } +} + +TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceEmptyTable) +{ + auto constexpr output_read_limit = 4'500; + auto constexpr pass_read_limit = 8'500; + auto const nsources = 10; + + // Table with single col of random int64 values + auto int64_empty_col = int64s_col{}.release(); + + std::vector> input_empty_columns; + input_empty_columns.emplace_back(std::move(int64_empty_col)); + + // Write to Parquet + auto const [expected_empty, filepath_empty] = write_file(input_empty_columns, + "num_rows_per_source_empty", + false, + false, + cudf::io::default_max_page_size_bytes, + 500); + + std::vector const datasources(nsources, filepath_empty); + + auto const options = + cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources}).build(); + auto const reader = cudf::io::chunked_parquet_reader( + output_read_limit, pass_read_limit, options, cudf::get_default_stream()); + + auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader); + + // Initialize expected_counts + std::vector const expected_counts(nsources, 0); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_empty->view(), result->view()); + + EXPECT_EQ(num_chunks, 1); + EXPECT_EQ(num_rows_per_source.size(), nsources); + EXPECT_TRUE( + std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin())); +} diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index 2edf9e0aee6..6c61535359f 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -2243,6 +2243,209 @@ TEST_F(ParquetReaderTest, StringsWithPageStats) } } +TEST_F(ParquetReaderTest, NumRowsPerSource) +{ + int constexpr num_rows = 10'723; // A prime number + int constexpr rows_in_row_group = 500; + + // Table with single col of random int64 values + auto const int64_data = random_values(num_rows); + column_wrapper const int64_col{ + int64_data.begin(), int64_data.end(), cudf::test::iterators::no_nulls()}; + cudf::table_view const expected({int64_col}); + + // Write to Parquet + auto const filepath = temp_env->get_temp_filepath("NumRowsPerSource.parquet"); + auto const out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .row_group_size_rows(rows_in_row_group) + .build(); + cudf::io::write_parquet(out_opts); + + // Read single data source entirely + { + auto const in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).build(); + auto const result = cudf::io::read_parquet(in_opts); + + 
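+    // num_rows_per_source reports, per input source in order, how many of the
+    // returned rows came from that source; sources skipped entirely report 0,
+    // and filtered reads leave the vector empty (checked further below).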
CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + EXPECT_EQ(result.metadata.num_rows_per_source.size(), 1); + EXPECT_EQ(result.metadata.num_rows_per_source[0], num_rows); + } + + // Read rows_to_read rows skipping rows_to_skip from single data source + { + auto constexpr rows_to_skip = 557; // a prime number != rows_in_row_group + auto constexpr rows_to_read = 7'232; + auto const in_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .skip_rows(rows_to_skip) + .num_rows(rows_to_read) + .build(); + auto const result = cudf::io::read_parquet(in_opts); + column_wrapper int64_col_selected{int64_data.begin() + rows_to_skip, + int64_data.begin() + rows_to_skip + rows_to_read, + cudf::test::iterators::no_nulls()}; + + cudf::table_view const expected_selected({int64_col_selected}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result.tbl->view()); + EXPECT_EQ(result.metadata.num_rows_per_source.size(), 1); + EXPECT_EQ(result.metadata.num_rows_per_source[0], rows_to_read); + } + + // Filtered read from single data source + { + auto constexpr max_value = 100; + auto literal_value = cudf::numeric_scalar{max_value}; + auto literal = cudf::ast::literal{literal_value}; + auto col_ref = cudf::ast::column_reference(0); + auto filter_expression = + cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_ref, literal); + + auto const in_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}) + .filter(filter_expression) + .build(); + + std::vector int64_data_filtered; + int64_data_filtered.reserve(num_rows); + std::copy_if( + int64_data.begin(), int64_data.end(), std::back_inserter(int64_data_filtered), [=](auto val) { + return val <= max_value; + }); + column_wrapper int64_col_filtered{ + int64_data_filtered.begin(), int64_data_filtered.end(), cudf::test::iterators::no_nulls()}; + + cudf::table_view expected_filtered({int64_col_filtered}); + + auto const result = cudf::io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_filtered, result.tbl->view()); + EXPECT_EQ(result.metadata.num_rows_per_source.size(), 0); + } + + // Read two data sources skipping the first entire file completely + { + auto constexpr rows_to_skip = 15'723; + auto constexpr nsources = 2; + std::vector const datasources(nsources, filepath); + + auto const in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{datasources}) + .skip_rows(rows_to_skip) + .build(); + + auto const result = cudf::io::read_parquet(in_opts); + + column_wrapper int64_col_selected{int64_data.begin() + rows_to_skip - num_rows, + int64_data.end(), + cudf::test::iterators::no_nulls()}; + + cudf::table_view const expected_selected({int64_col_selected}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result.tbl->view()); + EXPECT_EQ(result.metadata.num_rows_per_source.size(), 2); + EXPECT_EQ(result.metadata.num_rows_per_source[0], 0); + EXPECT_EQ(result.metadata.num_rows_per_source[1], nsources * num_rows - rows_to_skip); + } + + // Read ten data sources entirely + { + auto constexpr nsources = 10; + std::vector const datasources(nsources, filepath); + + auto const in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{datasources}).build(); + auto const result = cudf::io::read_parquet(in_opts); + + // Initialize expected_counts + std::vector const expected_counts(nsources, num_rows); + + EXPECT_EQ(result.metadata.num_rows_per_source.size(), nsources); + EXPECT_TRUE(std::equal(expected_counts.cbegin(), + 
expected_counts.cend(), + result.metadata.num_rows_per_source.cbegin())); + } + + // Read rows_to_read rows skipping rows_to_skip (> two sources) from ten data sources + { + auto constexpr rows_to_skip = 25'999; + auto constexpr rows_to_read = 47'232; + + auto constexpr nsources = 10; + std::vector const datasources(nsources, filepath); + + auto const in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{datasources}) + .skip_rows(rows_to_skip) + .num_rows(rows_to_read) + .build(); + + auto const result = cudf::io::read_parquet(in_opts); + + // Initialize expected_counts + std::vector expected_counts(nsources, num_rows); + + // Adjust expected_counts for rows_to_skip + int64_t counter = 0; + for (auto& nrows : expected_counts) { + if (counter < rows_to_skip) { + counter += nrows; + nrows = (counter >= rows_to_skip) ? counter - rows_to_skip : 0; + } else { + break; + } + } + + // Reset the counter + counter = 0; + + // Adjust expected_counts for rows_to_read + for (auto& nrows : expected_counts) { + if (counter < rows_to_read) { + counter += nrows; + nrows = (counter >= rows_to_read) ? rows_to_read - counter + nrows : nrows; + } else if (counter > rows_to_read) { + nrows = 0; + } + } + + EXPECT_EQ(result.metadata.num_rows_per_source.size(), nsources); + EXPECT_TRUE(std::equal(expected_counts.cbegin(), + expected_counts.cend(), + result.metadata.num_rows_per_source.cbegin())); + } +} + +TEST_F(ParquetReaderTest, NumRowsPerSourceEmptyTable) +{ + auto const nsources = 10; + + column_wrapper const int64_empty_col{}; + cudf::table_view const expected_empty({int64_empty_col}); + + // Write to Parquet + auto const filepath_empty = temp_env->get_temp_filepath("NumRowsPerSourceEmpty.parquet"); + auto const out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath_empty}, expected_empty) + .build(); + cudf::io::write_parquet(out_opts); + + // Read from Parquet + std::vector const datasources(nsources, filepath_empty); + + auto const in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{datasources}).build(); + auto const result = cudf::io::read_parquet(in_opts); + + // Initialize expected_counts + std::vector const expected_counts(nsources, 0); + + EXPECT_EQ(result.metadata.num_rows_per_source.size(), nsources); + EXPECT_TRUE(std::equal(expected_counts.cbegin(), + expected_counts.cend(), + result.metadata.num_rows_per_source.cbegin())); +} + /////////////////// // metadata tests diff --git a/cpp/tests/large_strings/json_tests.cpp b/cpp/tests/large_strings/json_tests.cu similarity index 50% rename from cpp/tests/large_strings/json_tests.cpp rename to cpp/tests/large_strings/json_tests.cu index bf16d131ba7..49abf7b484d 100644 --- a/cpp/tests/large_strings/json_tests.cpp +++ b/cpp/tests/large_strings/json_tests.cu @@ -14,8 +14,13 @@ * limitations under the License. 
*/ +#include "../io/json/json_utils.cuh" #include "large_strings_fixture.hpp" +#include + +#include +#include #include #include @@ -28,31 +33,57 @@ TEST_F(JsonLargeReaderTest, MultiBatch) { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } { "a": { "y" : 6}, "b" : [6 ], "c": 13 } { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; - constexpr size_t expected_file_size = std::numeric_limits::max() / 2; + constexpr size_t batch_size_ub = std::numeric_limits::max(); + constexpr size_t expected_file_size = 1.5 * static_cast(batch_size_ub); std::size_t const log_repetitions = static_cast(std::ceil(std::log2(expected_file_size / json_string.size()))); json_string.reserve(json_string.size() * (1UL << log_repetitions)); - std::size_t numrows = 4; for (std::size_t i = 0; i < log_repetitions; i++) { json_string += json_string; - numrows <<= 1; } constexpr int num_sources = 2; - std::vector> hostbufs( - num_sources, cudf::host_span(json_string.data(), json_string.size())); + std::vector> hostbufs( + num_sources, + cudf::host_span(reinterpret_cast(json_string.data()), + json_string.size())); // Initialize parsing options (reading json lines) cudf::io::json_reader_options json_lines_options = cudf::io::json_reader_options::builder( cudf::io::source_info{ - cudf::host_span>(hostbufs.data(), hostbufs.size())}) + cudf::host_span>(hostbufs.data(), hostbufs.size())}) .lines(true) .compression(cudf::io::compression_type::NONE) .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); // Read full test data via existing, nested JSON lines reader cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); - ASSERT_EQ(current_reader_table.tbl->num_rows(), numrows * num_sources); + + std::vector> datasources; + for (auto& hb : hostbufs) { + datasources.emplace_back(cudf::io::datasource::create(hb)); + } + // Test for different chunk sizes + std::vector chunk_sizes{ + batch_size_ub / 4, batch_size_ub / 2, batch_size_ub, static_cast(batch_size_ub * 2)}; + for (auto chunk_size : chunk_sizes) { + auto const tables = + split_byte_range_reading(datasources, + json_lines_options, + chunk_size, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + + auto table_views = std::vector(tables.size()); + std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) { + return table.tbl->view(); + }); + auto result = cudf::concatenate(table_views); + + // Verify that the data read via chunked reader matches the data read via nested JSON reader + // cannot use EQUAL due to concatenate removing null mask + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view()); + } } diff --git a/cpp/tests/streams/dictionary_test.cpp b/cpp/tests/streams/dictionary_test.cpp index 9e81c8574b8..03e4cf47470 100644 --- a/cpp/tests/streams/dictionary_test.cpp +++ b/cpp/tests/streams/dictionary_test.cpp @@ -26,6 +26,52 @@ class DictionaryTest : public cudf::test::BaseFixture {}; +TEST_F(DictionaryTest, FactoryColumnViews) +{ + cudf::test::strings_column_wrapper keys({"aaa", "ccc", "ddd", "www"}); + cudf::test::fixed_width_column_wrapper values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + + auto dictionary = cudf::make_dictionary_column(keys, values, cudf::test::get_default_stream()); + cudf::dictionary_column_view view(dictionary->view()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values); +} + +TEST_F(DictionaryTest, FactoryColumns) +{ + std::vector h_keys{"aaa", "ccc", "ddd", "www"}; + cudf::test::strings_column_wrapper 
keys(h_keys.begin(), h_keys.end()); + std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + + auto dictionary = cudf::make_dictionary_column( + keys.release(), values.release(), cudf::test::get_default_stream()); + cudf::dictionary_column_view view(dictionary->view()); + + cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); +} + +TEST_F(DictionaryTest, FactoryColumnsNullMaskCount) +{ + std::vector h_keys{"aaa", "ccc", "ddd", "www"}; + cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end()); + std::vector h_values{2, 0, 3, 1, 2, 2, 2, 3, 0}; + cudf::test::fixed_width_column_wrapper values(h_values.begin(), h_values.end()); + + auto dictionary = cudf::make_dictionary_column( + keys.release(), values.release(), rmm::device_buffer{}, 0, cudf::test::get_default_stream()); + cudf::dictionary_column_view view(dictionary->view()); + + cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end()); + cudf::test::fixed_width_column_wrapper values_expected(h_values.begin(), h_values.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected); +} + TEST_F(DictionaryTest, Encode) { cudf::test::fixed_width_column_wrapper col({1, 2, 3, 4, 5}); diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp index 9e4ee5a4a93..9ba862585d0 100644 --- a/cpp/tests/streams/interop_test.cpp +++ b/cpp/tests/streams/interop_test.cpp @@ -14,6 +14,13 @@ * limitations under the License. */ +// These interop functions are deprecated. We keep the code in this +// test and will migrate the tests to export via the arrow C data +// interface with to_arrow_host which arrow can consume. For now, the +// test is commented out. + +#if 0 + #include #include #include @@ -67,3 +74,5 @@ TEST_F(ArrowTest, FromArrowScalar) auto arrow_scalar = arrow::MakeScalar(value); cudf::from_arrow(*arrow_scalar, cudf::test::get_default_stream()); } + +#endif diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp index 711e20e4b17..7963dced292 100644 --- a/cpp/tests/streams/lists_test.cpp +++ b/cpp/tests/streams/lists_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -212,3 +213,57 @@ TEST_F(ListTest, HaveOverlap) cudf::nan_equality::ALL_EQUAL, cudf::test::get_default_stream()); } + +TEST_F(ListTest, Explode) +{ + cudf::test::fixed_width_column_wrapper list_col_a{100, 200, 300}; + cudf::test::lists_column_wrapper list_col_b{ + cudf::test::lists_column_wrapper{1, 2, 7}, + cudf::test::lists_column_wrapper{5, 6}, + cudf::test::lists_column_wrapper{0, 3}}; + cudf::test::strings_column_wrapper list_col_c{"string0", "string1", "string2"}; + cudf::table_view lists_table({list_col_a, list_col_b, list_col_c}); + cudf::explode(lists_table, 1, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ExplodePosition) +{ + cudf::test::fixed_width_column_wrapper list_col_a{100, 200, 300}; + cudf::test::lists_column_wrapper list_col_b{ + cudf::test::lists_column_wrapper{1, 2, 7}, + cudf::test::lists_column_wrapper{5, 6}, + cudf::test::lists_column_wrapper{0, 3}}; + cudf::test::strings_column_wrapper list_col_c{"string0", "string1", "string2"}; + cudf::table_view lists_table({list_col_a, list_col_b, list_col_c}); + cudf::explode_position(lists_table, 1, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ExplodeOuter) +{ + constexpr auto null = 0; + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + cudf::test::lists_column_wrapper list_col_a{ + cudf::test::lists_column_wrapper({1, null, 7}, valids), + cudf::test::lists_column_wrapper({5, null, 0, null}, valids), + cudf::test::lists_column_wrapper{}, + cudf::test::lists_column_wrapper({0, null, 8}, valids)}; + cudf::test::fixed_width_column_wrapper list_col_b{100, 200, 300, 400}; + cudf::table_view lists_table({list_col_a, list_col_b}); + cudf::explode_outer(lists_table, 0, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ExplodeOuterPosition) +{ + constexpr auto null = 0; + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); + cudf::test::lists_column_wrapper list_col_a{ + cudf::test::lists_column_wrapper({1, null, 7}, valids), + cudf::test::lists_column_wrapper({5, null, 0, null}, valids), + cudf::test::lists_column_wrapper{}, + cudf::test::lists_column_wrapper({0, null, 8}, valids)}; + cudf::test::fixed_width_column_wrapper list_col_b{100, 200, 300, 400}; + cudf::table_view lists_table({list_col_a, list_col_b}); + cudf::explode_outer_position(lists_table, 0, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/reshape_test.cpp b/cpp/tests/streams/reshape_test.cpp new file mode 100644 index 00000000000..d7c5da91bca --- /dev/null +++ b/cpp/tests/streams/reshape_test.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +class ReshapeTest : public cudf::test::BaseFixture {}; + +TEST_F(ReshapeTest, InterleaveColumns) +{ + auto a = cudf::test::fixed_width_column_wrapper({0, 3, 6}); + auto b = cudf::test::fixed_width_column_wrapper({1, 4, 7}); + auto c = cudf::test::fixed_width_column_wrapper({2, 5, 8}); + cudf::table_view in(std::vector{a, b, c}); + cudf::interleave_columns(in, cudf::test::get_default_stream()); +} + +TEST_F(ReshapeTest, Tile) +{ + auto a = cudf::test::fixed_width_column_wrapper({-1, 0, 1}); + cudf::table_view in(std::vector{a}); + cudf::tile(in, 2, cudf::test::get_default_stream()); +} + +TEST_F(ReshapeTest, ByteCast) +{ + auto a = cudf::test::fixed_width_column_wrapper({0, 100, -100, 1000, 1000}); + cudf::byte_cast(a, cudf::flip_endianness::YES, cudf::test::get_default_stream()); + cudf::byte_cast(a, cudf::flip_endianness::NO, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 51e9b3bd0a0..7a038fa6d75 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -294,7 +294,7 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) std::iota(h_integers.begin(), h_integers.end(), -(TypeParam)(h_integers.size() / 2)); h_integers.push_back(std::numeric_limits::min()); h_integers.push_back(std::numeric_limits::max()); - auto d_integers = cudf::detail::make_device_uvector_sync( + auto const d_integers = cudf::detail::make_device_uvector_sync( h_integers, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto integers = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, (cudf::size_type)d_integers.size()); @@ -308,8 +308,6 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) // convert to strings auto results_strings = cudf::strings::from_integers(integers->view()); - // copy back to host - h_integers = cudf::detail::make_host_vector_sync(d_integers, cudf::get_default_stream()); std::vector h_strings; for (auto itr = h_integers.begin(); itr != h_integers.end(); ++itr) h_strings.push_back(std::to_string(*itr)); diff --git a/cpp/tests/utilities/random_seed.cpp b/cpp/tests/utilities/random_seed.cpp index 4d5035e5a22..ab5a31ce161 100644 --- a/cpp/tests/utilities/random_seed.cpp +++ b/cpp/tests/utilities/random_seed.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,7 +23,7 @@ namespace detail { /** * @copydoc cudf::test::detail::random_generator_incrementing_seed() */ -uint64_t random_generator_incrementing_seed() +CUDF_EXPORT uint64_t random_generator_incrementing_seed() { static uint64_t seed = 0; return ++seed; diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index df9103640f4..93259fd63ee 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -18,16 +18,33 @@ #include #include +#include #include +#include #include #include #include #include -class PinnedMemoryTest : public cudf::test::BaseFixture {}; +class PinnedMemoryTest : public cudf::test::BaseFixture { + size_t prev_copy_threshold; + size_t prev_alloc_threshold; -TEST(PinnedMemoryTest, MemoryResourceGetAndSet) + public: + PinnedMemoryTest() + : prev_copy_threshold{cudf::get_kernel_pinned_copy_threshold()}, + prev_alloc_threshold{cudf::get_allocate_host_as_pinned_threshold()} + { + } + ~PinnedMemoryTest() override + { + cudf::set_kernel_pinned_copy_threshold(prev_copy_threshold); + cudf::set_allocate_host_as_pinned_threshold(prev_alloc_threshold); + } +}; + +TEST_F(PinnedMemoryTest, MemoryResourceGetAndSet) { // Global environment for temporary files auto const temp_env = static_cast( @@ -63,3 +80,49 @@ TEST(PinnedMemoryTest, MemoryResourceGetAndSet) // reset memory resource back cudf::set_pinned_memory_resource(last_mr); } + +TEST_F(PinnedMemoryTest, KernelCopyThresholdGetAndSet) +{ + cudf::set_kernel_pinned_copy_threshold(12345); + EXPECT_EQ(cudf::get_kernel_pinned_copy_threshold(), 12345); +} + +TEST_F(PinnedMemoryTest, HostAsPinnedThresholdGetAndSet) +{ + cudf::set_allocate_host_as_pinned_threshold(12345); + EXPECT_EQ(cudf::get_allocate_host_as_pinned_threshold(), 12345); +} + +TEST_F(PinnedMemoryTest, MakePinnedVector) +{ + cudf::set_allocate_host_as_pinned_threshold(0); + + // should always use pinned memory + { + auto const vec = cudf::detail::make_pinned_vector_async(1, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } +} + +TEST_F(PinnedMemoryTest, MakeHostVector) +{ + cudf::set_allocate_host_as_pinned_threshold(7); + + // allocate smaller than the threshold + { + auto const vec = cudf::detail::make_host_vector(1, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } + + // allocate the same size as the threshold + { + auto const vec = cudf::detail::make_host_vector(7, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } + + // allocate larger than the threshold + { + auto const vec = cudf::detail::make_host_vector(2, cudf::get_default_stream()); + EXPECT_FALSE(vec.get_allocator().is_device_accessible()); + } +} diff --git a/dependencies.yaml b/dependencies.yaml index b5104024e18..3401362ad9a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -112,6 +112,13 @@ files: includes: - test_python_common - test_python_cudf + py_rapids_build_cudf: + output: pyproject + pyproject_dir: python/pylibcudf + extras: + table: build-system + includes: + - rapids_build_skbuild py_build_pylibcudf: output: pyproject pyproject_dir: python/pylibcudf @@ -130,6 +137,7 @@ files: includes: - run_pylibcudf - pyarrow_run + # TODO: finishme # py_test_pylibcudf: # output: pyproject # pyproject_dir: python/pylibcudf @@ -314,8 +322,8 @@ dependencies: - output_types: conda packages: - fmt>=10.1.1,<11 - - librmm==24.8.*,>=0.0.0a0 - - 
libkvikio==24.8.*,>=0.0.0a0 + - librmm==24.10.*,>=0.0.0a0 + - libkvikio==24.10.*,>=0.0.0a0 - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - nvcomp==3.0.6 @@ -356,7 +364,7 @@ dependencies: common: - output_types: conda packages: - - &rmm_conda rmm==24.8.*,>=0.0.0a0 + - &rmm_unsuffixed rmm==24.10.*,>=0.0.0a0 - pip - pip: - git+https://github.com/python-streamz/streamz.git@master @@ -370,13 +378,17 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} - packages: &build_python_packages_cu12 - - rmm-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} - packages: &build_python_packages_cu11 - - rmm-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*rmm_conda] } + - matrix: + cuda: "12.*" + cuda_suffixed: "true" + packages: + - rmm-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" + packages: + - rmm-cu11==24.10.*,>=0.0.0a0 + - {matrix: null, packages: [*rmm_unsuffixed]} libarrow_build: common: - output_types: conda @@ -532,7 +544,7 @@ dependencies: - output_types: [conda] packages: - breathe>=4.35.0 - - dask-cuda==24.8.*,>=0.0.0a0 + - dask-cuda==24.10.*,>=0.0.0a0 - *doxygen - make - myst-nb @@ -591,7 +603,7 @@ dependencies: - typing_extensions>=4.0.0 - output_types: conda packages: - - *rmm_conda + - *rmm_unsuffixed - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -610,13 +622,27 @@ dependencies: - {matrix: null, packages: *run_pylibcudf_packages_all_cu11} - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - rmm-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - rmm-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "12.*" + cuda_suffixed: "false" + packages: + - *rmm_unsuffixed + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - rmm-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*rmm_conda]} + - rmm-cu11==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "false" + packages: &run_pylibcudf_cu11_unsuffixed + - *rmm_unsuffixed + - {matrix: null, packages: *run_pylibcudf_cu11_unsuffixed} run_cudf: common: - output_types: [conda, requirements, pyproject] @@ -629,7 +655,7 @@ dependencies: - typing_extensions>=4.0.0 - output_types: conda packages: - - *rmm_conda + - *rmm_unsuffixed - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -650,23 +676,40 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - pynvjitlink>=0.0.0a0 + - &pynvjitlink_unsuffixed pynvjitlink>=0.0.0a0 - matrix: {cuda: "11.*"} packages: - - cubinlinker - - ptxcompiler + - &cubinlinker_unsuffixed cubinlinker + - &ptxcompiler_unsuffixed ptxcompiler - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - rmm-cu12==24.8.*,>=0.0.0a0 + - rmm-cu12==24.10.*,>=0.0.0a0 - pynvjitlink-cu12>=0.0.0a0 - - matrix: {cuda: "11.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "false" packages: - - rmm-cu11==24.8.*,>=0.0.0a0 + - *rmm_unsuffixed + - *pynvjitlink_unsuffixed + - matrix: + cuda: "11.*" + cuda_suffixed: "true" + packages: + - rmm-cu11==24.10.*,>=0.0.0a0 - cubinlinker-cu11 - ptxcompiler-cu11 - - {matrix: null, packages: [cubinlinker, ptxcompiler, *rmm_conda]} + - matrix: + cuda: "11.*" + cuda_suffixed: "false" + packages: &run_cudf_cu11_unsuffixed + - *cubinlinker_unsuffixed + - 
*ptxcompiler_unsuffixed + - *rmm_unsuffixed + - {matrix: null, packages: *run_cudf_cu11_unsuffixed} run_cudf_polars: common: - output_types: [conda, requirements, pyproject] @@ -676,7 +719,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - rapids-dask-dependency==24.8.*,>=0.0.0a0 + - rapids-dask-dependency==24.10.*,>=0.0.0a0 run_custreamz: common: - output_types: conda @@ -762,13 +805,13 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==24.8.*,>=0.0.0a0 + - dask-cuda==24.10.*,>=0.0.0a0 - *numba depends_on_cudf: common: - output_types: conda packages: - - &cudf_conda cudf==24.8.*,>=0.0.0a0 + - &cudf_unsuffixed cudf==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -778,18 +821,22 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - cudf-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - cudf-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - cudf-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*cudf_conda]} + - cudf-cu11==24.10.*,>=0.0.0a0 + - {matrix: null, packages: [*cudf_unsuffixed]} depends_on_cudf_kafka: common: - output_types: conda packages: - - &cudf_kafka_conda cudf_kafka==24.8.*,>=0.0.0a0 + - &cudf_kafka_unsuffixed cudf_kafka==24.10.*,>=0.0.0a0 - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -799,13 +846,17 @@ dependencies: specific: - output_types: [requirements, pyproject] matrices: - - matrix: {cuda: "12.*"} + - matrix: + cuda: "12.*" + cuda_suffixed: "true" packages: - - cudf_kafka-cu12==24.8.*,>=0.0.0a0 - - matrix: {cuda: "11.*"} + - cudf_kafka-cu12==24.10.*,>=0.0.0a0 + - matrix: + cuda: "11.*" + cuda_suffixed: "true" packages: - - cudf_kafka-cu11==24.8.*,>=0.0.0a0 - - {matrix: null, packages: [*cudf_kafka_conda]} + - cudf_kafka-cu11==24.10.*,>=0.0.0a0 + - {matrix: null, packages: [*cudf_kafka_unsuffixed]} depends_on_cupy: common: - output_types: conda diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c3c14ac8cad..7421d9be298 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -556,10 +556,16 @@ def on_missing_reference(app, env, node, contnode): ("py:class", "Dtype"), # The following are erroneously warned due to # https://github.com/sphinx-doc/sphinx/issues/11225 + ("py:obj", "cudf.Index.values_host"), ("py:class", "pa.Array"), ("py:class", "ScalarLike"), ("py:class", "ParentType"), ("py:class", "ColumnLike"), + ("py:class", "ColumnLike"), + ("py:obj", "cudf.Index.transpose"), + ("py:obj", "cudf.Index.T"), + ("py:obj", "cudf.Index.to_flat_index"), + ("py:obj", "cudf.MultiIndex.to_flat_index"), # TODO: Remove this when we figure out why typing_extensions doesn't seem # to map types correctly for intersphinx ("py:class", "typing_extensions.Self"), diff --git a/docs/cudf/source/cudf_pandas/how-it-works.md b/docs/cudf/source/cudf_pandas/how-it-works.md index 75f57742ac9..8efd9d7e063 100644 --- a/docs/cudf/source/cudf_pandas/how-it-works.md +++ b/docs/cudf/source/cudf_pandas/how-it-works.md @@ -36,3 +36,19 @@ transfers. When using `cudf.pandas`, cuDF's [pandas compatibility mode](api.options) is automatically enabled, ensuring consistency with pandas-specific semantics like default sort ordering. 
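+
+As a hedged sketch of how the memory modes documented below can be selected
+(assuming only the public `cudf.pandas.install()` entry point and the
+`CUDF_PANDAS_RMM_MODE` variable it reads; the frame is illustrative):
+
+```python
+import os
+
+# Must be set before cudf.pandas is installed; any mode listed below works.
+os.environ["CUDF_PANDAS_RMM_MODE"] = "managed"
+
+import cudf.pandas
+
+cudf.pandas.install()
+
+import pandas as pd  # now GPU-accelerated where possible
+
+df = pd.DataFrame({"a": [1, 2, 3]})
+print(df["a"].sum())
+```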
+
+`cudf.pandas` uses a managed memory pool by default. This allows `cudf.pandas` to process datasets larger than the memory of the GPU it is running on. Managed memory prefetching is also enabled by default to improve memory access performance. For more information on CUDA Unified Memory (managed memory), performance, and prefetching, see [this NVIDIA Developer blog post](https://developer.nvidia.com/blog/improving-gpu-memory-oversubscription-performance/).
+
+Pool allocators improve allocation performance. Without using one, memory
+allocation may be a bottleneck depending on the workload. Managed memory
+enables oversubscribing GPU memory. This allows `cudf.pandas` to process
+data larger than GPU memory in many cases, without CPU (pandas) fallback.
+
+Other memory allocators can be used by changing the environment
+variable `CUDF_PANDAS_RMM_MODE` to one of the following.
+
+1. "managed_pool" (default): CUDA Unified Memory (managed memory) with RMM's pool allocator.
+2. "managed": CUDA Unified Memory (managed memory) with no pool allocator.
+3. "async": CUDA's built-in asynchronous pool allocator with normal CUDA device memory.
+4. "pool": RMM's pool allocator with normal CUDA device memory.
+5. "cuda": normal CUDA device memory with no pool allocator.
diff --git a/docs/cudf/source/developer_guide/documentation.md b/docs/cudf/source/developer_guide/documentation.md
index c8da689479c..4f5a57fec02 100644
--- a/docs/cudf/source/developer_guide/documentation.md
+++ b/docs/cudf/source/developer_guide/documentation.md
@@ -164,7 +164,7 @@ The directive should be used inside docstrings like so:
     Docstring body
 
     .. pandas-compat::
-        **$API_NAME**
+        :meth:`pandas.DataFrame.METHOD`
 
         Explanation of differences
 ```
diff --git a/docs/cudf/source/user_guide/api_docs/groupby.rst b/docs/cudf/source/user_guide/api_docs/groupby.rst
index 80811efa33f..ca29087cbf9 100644
--- a/docs/cudf/source/user_guide/api_docs/groupby.rst
+++ b/docs/cudf/source/user_guide/api_docs/groupby.rst
@@ -68,7 +68,6 @@ Computations / descriptive stats
    GroupBy.std
    GroupBy.sum
    GroupBy.var
-   GroupBy.corr
    GroupBy.cov
 
 The following methods are available in both ``SeriesGroupBy`` and
@@ -81,6 +80,7 @@ application to columns of a specific data type.
    :toctree: api/
 
    DataFrameGroupBy.bfill
+   DataFrameGroupBy.corr
    DataFrameGroupBy.count
    DataFrameGroupBy.cumcount
    DataFrameGroupBy.cummax
@@ -102,5 +102,6 @@ The following methods are available only for ``SeriesGroupBy`` objects.
 .. autosummary::
    :toctree: api/
 
+   SeriesGroupBy.corr
    SeriesGroupBy.nunique
    SeriesGroupBy.unique
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index 1c7e364c40f..c8933981736 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -18,3 +18,4 @@ I/O Functions
    avro
    csv
    json
+   parquet
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst
new file mode 100644
index 00000000000..9dfbadfa216
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst
@@ -0,0 +1,6 @@
+=======
+Parquet
+=======
+
+..
automodule:: cudf._lib.pylibcudf.io.parquet + :members: diff --git a/docs/cudf/source/user_guide/io/read-json.md b/docs/cudf/source/user_guide/io/read-json.md index 7049c75d1c1..d2bb021a5b5 100644 --- a/docs/cudf/source/user_guide/io/read-json.md +++ b/docs/cudf/source/user_guide/io/read-json.md @@ -218,11 +218,11 @@ reads a JSON object as a single line and then extracts the # first read the JSON object with line=True >>> df = cudf.read_json(j, lines=True) >>> df - metadata records + metadata results 0 {'vehicle': 'car'} [{'id': 0, 'distance': 1.2}, {'id': 1, 'distan... -# then explode the 'records' column ->>> df = df['records'].explode().struct.explode() +# then explode the 'results' column +>>> df = df['results'].explode().struct.explode() >>> df id distance 0 0 1.2 diff --git a/java/ci/README.md b/java/ci/README.md index 49481efab6b..ccb9efb50b6 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-rocky8 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.08 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.10 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable gcc-toolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-24.08.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-24.10.0-SNAPSHOT-cuda11.jar. diff --git a/java/pom.xml b/java/pom.xml index 70230e6bc71..9694e741f16 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 24.08.0-SNAPSHOT + 24.10.0-SNAPSHOT cudfjni diff --git a/java/src/main/java/ai/rapids/cudf/Cudf.java b/java/src/main/java/ai/rapids/cudf/Cudf.java new file mode 100644 index 00000000000..d09e2f87ed4 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/Cudf.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.rapids.cudf; + +public class Cudf { + + static { + NativeDepsLoader.loadNativeDeps(); + } + + /** + * cuDF copies that are smaller than the threshold will use a kernel to copy, instead + * of cudaMemcpyAsync. + */ + public static native void setKernelPinnedCopyThreshold(long kernelPinnedCopyThreshold); + + /** + * cudf allocations that are smaller than the threshold will use the pinned host + * memory resource. 
+   */
+  public static native void setPinnedAllocationThreshold(long pinnedAllocationThreshold);
+}
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index 56f8f9d0472..22059c5bc7f 100644
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -210,6 +210,7 @@ target_compile_definitions(
   cudfjni PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_DEFINITIONS}>"
                  "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_DEFINITIONS}>"
 )
+target_link_options(cudfjni PRIVATE "-Wl,--no-undefined")
 
 if(USE_GDS)
   add_library(cufilejni src/CuFileJni.cpp)
diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp
index ea04c1cda83..a3b4bfcb63e 100644
--- a/java/src/main/native/include/jni_utils.hpp
+++ b/java/src/main/native/include/jni_utils.hpp
@@ -284,7 +284,7 @@ class native_jArray {
     return data()[index];
   }
 
-  const N_TYPE* const data() const
+  N_TYPE const* data() const
   {
     init_data_ptr();
     return data_ptr;
@@ -296,17 +296,15 @@ class native_jArray {
     return data_ptr;
   }
 
-  const N_TYPE* const begin() const { return data(); }
+  N_TYPE const* begin() const { return data(); }
   N_TYPE* begin() { return data(); }
 
-  const N_TYPE* const end() const { return data() + size(); }
+  N_TYPE const* end() const { return data() + size(); }
   N_TYPE* end() { return data() + size(); }
 
-  const J_ARRAY_TYPE get_jArray() const { return orig; }
-
-  J_ARRAY_TYPE get_jArray() { return orig; }
+  J_ARRAY_TYPE get_jArray() const { return orig; }
 
   /**
   * @brief Conversion to std::vector
@@ -430,9 +428,7 @@ class native_jpointerArray {
   T* const* begin() const { return data(); }
   T* const* end() const { return data() + size(); }
 
-  const jlongArray get_jArray() const { return wrapped.get_jArray(); }
-
-  jlongArray get_jArray() { return wrapped.get_jArray(); }
+  jlongArray get_jArray() const { return wrapped.get_jArray(); }
 
   void assert_no_nulls() const
   {
@@ -624,7 +620,7 @@ class native_jstring {
     return true;
   }
 
-  const jstring get_jstring() const { return orig; }
+  jstring get_jstring() const { return orig; }
 
   ~native_jstring()
   {
@@ -753,13 +749,13 @@ class native_jstringArray {
     return cache[index];
   }
 
-  char const** const as_c_array() const
+  char const** as_c_array() const
   {
     init_c_cache();
     return c_cache.data();
   }
 
-  const std::vector<std::string> as_cpp_vector() const
+  std::vector<std::string> as_cpp_vector() const
   {
     init_cpp_cache();
     return cpp_cache;
diff --git a/java/src/main/native/src/CudfJni.cpp b/java/src/main/native/src/CudfJni.cpp
index 698a8f6ff02..2860dc2e4b2 100644
--- a/java/src/main/native/src/CudfJni.cpp
+++ b/java/src/main/native/src/CudfJni.cpp
@@ -18,6 +18,7 @@
 #include
 #include
+#include
 
 #include
 
@@ -201,4 +202,28 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_isPtdsEnabled(JNIEnv* env, j
   return cudf::jni::is_ptds_enabled;
 }
 
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cudf_setKernelPinnedCopyThreshold(JNIEnv* env,
+                                                                             jclass clazz,
+                                                                             jlong jthreshold)
+{
+  try {
+    cudf::jni::auto_set_device(env);
+    auto threshold = static_cast<std::size_t>(jthreshold);
+    cudf::set_kernel_pinned_copy_threshold(threshold);
+  }
+  CATCH_STD(env, )
+}
+
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cudf_setPinnedAllocationThreshold(JNIEnv* env,
+                                                                             jclass clazz,
+                                                                             jlong jthreshold)
+{
+  try {
+    cudf::jni::auto_set_device(env);
+    auto threshold = static_cast<std::size_t>(jthreshold);
+    cudf::set_allocate_host_as_pinned_threshold(threshold);
+  }
+  CATCH_STD(env, )
+}
+
 } // extern "C"
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 5842a980fc4..09c04a77590 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ 
b/java/src/main/native/src/RmmJni.cpp
@@ -154,13 +154,6 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
   }
 };
 
-template <typename Upstream>
-tracking_resource_adaptor<Upstream>* make_tracking_adaptor(Upstream* upstream,
-                                                           std::size_t size_alignment)
-{
-  return new tracking_resource_adaptor{upstream, size_alignment};
-}
-
 /**
  * @brief An RMM device memory resource adaptor that delegates to the wrapped resource
  * for most operations but will call Java to handle certain situations (e.g.: allocation failure).
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index c58cd732b39..a9ace1398e4 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -45,6 +45,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -2789,7 +2790,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftDistinctJoinGatherMap
   auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right)
                      ? cudf::nullable_join::YES
                      : cudf::nullable_join::NO;
-  if (cudf::detail::has_nested_columns(right)) {
+  if (cudf::has_nested_columns(right)) {
     cudf::distinct_hash_join<cudf::has_nested::YES> hash(right, left, has_nulls, nulleq);
     return hash.left_join();
   } else {
@@ -3010,7 +3011,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerDistinctJoinGatherMa
   std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
             std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
     maps;
-  if (cudf::detail::has_nested_columns(right)) {
+  if (cudf::has_nested_columns(right)) {
     cudf::distinct_hash_join<cudf::has_nested::YES> hash(right, left, has_nulls, nulleq);
     maps = hash.inner_join();
   } else {
diff --git a/java/src/main/native/src/aggregation128_utils.cu b/java/src/main/native/src/aggregation128_utils.cu
index a32e7d27085..631df58b017 100644
--- a/java/src/main/native/src/aggregation128_utils.cu
+++ b/java/src/main/native/src/aggregation128_utils.cu
@@ -34,7 +34,7 @@ namespace {
 // Functor to reassemble a 128-bit value from four 64-bit chunks with overflow detection.
-class chunk_assembler : public thrust::unary_function<cudf::size_type, __int128_t> {
+class chunk_assembler {
 public:
   chunk_assembler(bool* overflows,
                   uint64_t const* chunks0,
diff --git a/pyproject.toml b/pyproject.toml
index 2f59864894b..e15cb7b3cdd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,69 @@ quiet-level = 3
 line-length = 79
 
 [tool.ruff.lint]
-select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH", "FA", "UP006", "UP007"]
+typing-modules = ["cudf._typing"]
+select = [
+    # pycodestyle Error
+    "E",
+    # Pyflakes
+    "F",
+    # pycodestyle Warning
+    "W",
+    # no-blank-line-before-function
+    "D201",
+    # one-blank-line-after-class
+    "D204",
+    # indent-with-spaces
+    "D206",
+    # under-indentation
+    "D207",
+    # over-indentation
+    "D208",
+    # new-line-after-last-paragraph
+    "D209",
+    # surrounding-whitespace
+    "D210",
+    # blank-line-before-class
+    "D211",
+    # section-not-over-indented
+    "D214",
+    # section-underline-not-over-indented
+    "D215",
+    # triple-single-quotes
+    "D300",
+    # escape-sequence-in-docstring
+    "D301",
+    # first-line-capitalized
+    "D403",
+    # capitalize-section-name
+    "D405",
+    # new-line-after-section-name
+    "D406",
+    # dashed-underline-after-section
+    "D407",
+    # section-underline-after-name
+    "D408",
+    # section-underline-matches-section-length
+    "D409",
+    # no-blank-line-after-section
+    "D410",
+    # no-blank-line-before-section
+    "D411",
+    # blank-lines-between-header-and-content
+    "D412",
+    # empty-docstring-section
+    "D414",
+    # overload-with-docstring
+    "D418",
+    # flake8-type-checking
+    "TCH",
+    # flake8-future-annotations
+    "FA",
+    # non-pep585-annotation
+    "UP006",
+    # non-pep604-annotation
+    "UP007"
+]
 ignore = [
     # whitespace before :
     "E203",
diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt
new file mode 100644
index 00000000000..ecadbf5cbbc
--- /dev/null
+++ b/python/cudf/CMakeLists.txt
@@ -0,0 +1,101 @@
+# =============================================================================
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
+
+include(../../rapids_config.cmake)
+include(rapids-cuda)
+rapids_cuda_init_architectures(cudf-python)
+
+project(
+  cudf-python
+  VERSION "${RAPIDS_VERSION}"
+  LANGUAGES CXX CUDA
+)
+
+option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files"
+       OFF
+)
+option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF)
+mark_as_advanced(USE_LIBARROW_FROM_PYARROW)
+
+# Find Python early so that later commands can use it
+find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)
+
+# If the user requested it, we attempt to find CUDF.
+if(FIND_CUDF_CPP)
+  include(rapids-cpm)
+  include(rapids-export)
+  include(rapids-find)
+  rapids_cpm_init()
+
+  if(USE_LIBARROW_FROM_PYARROW)
+    # We need to find arrow before libcudf since libcudf requires it but doesn't bundle arrow
+    # libraries. These variables have no effect because we are always searching for arrow via
+    # pyarrow, but they must be set as they are required arguments to the function in
+    # get_arrow.cmake.
+    set(CUDF_USE_ARROW_STATIC OFF)
+    set(CUDF_ENABLE_ARROW_S3 OFF)
+    set(CUDF_ENABLE_ARROW_ORC OFF)
+    set(CUDF_ENABLE_ARROW_PYTHON OFF)
+    set(CUDF_ENABLE_ARROW_PARQUET OFF)
+    include(../../cpp/cmake/thirdparty/get_arrow.cmake)
+  endif()
+
+  find_package(cudf "${RAPIDS_VERSION}" REQUIRED)
+
+  # An installed version of libcudf doesn't provide the dlpack headers, so we need to download
+  # dlpack for interop.pyx.
+  include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
+else()
+  set(cudf_FOUND OFF)
+endif()
+
+include(rapids-cython-core)
+
+if(NOT cudf_FOUND)
+  set(BUILD_TESTS OFF)
+  set(BUILD_BENCHMARKS OFF)
+  set(CUDF_BUILD_TESTUTIL OFF)
+  set(CUDF_BUILD_STREAMS_TEST_UTIL OFF)
+  set(CUDA_STATIC_RUNTIME ON)
+
+  add_subdirectory(../../cpp cudf-cpp EXCLUDE_FROM_ALL)
+
+  # libcudf targets are excluded by default above via EXCLUDE_FROM_ALL to remove extraneous
+  # components like headers from libcudacxx, but we do need the libraries. However, we want to
+  # control where they are installed to. Since there are multiple subpackages of cudf._lib that
+  # require access to libcudf, we place the library and all its dependent artifacts in the cudf
+  # directory as a single source of truth and modify the other rpaths appropriately.
+  set(cython_lib_dir cudf)
+  include(cmake/Modules/WheelHelpers.cmake)
+  # TODO: This install is currently overzealous. We should only install the libraries that are
+  # downloaded by CPM during the build, not libraries that were found on the system. However, in
+  # practice this would only be a problem if libcudf was not found but some of the dependencies
+  # were, and we have no real use cases where that happens.
+ install_aliased_imported_targets( + TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp + DESTINATION ${cython_lib_dir} + ) +endif() + +rapids_cython_init() + +include(cmake/Modules/LinkPyarrowHeaders.cmake) +add_subdirectory(cudf/_lib) +add_subdirectory(udf_cpp) + +if(DEFINED cython_lib_dir) + rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}") +endif() diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index ceae1b148aa..f6d9c8c404c 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -3,30 +3,9 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport make_shared, shared_ptr, unique_ptr -from libcpp.utility cimport move from cudf._lib.column cimport Column -from cudf._lib.pylibcudf.libcudf.column.column cimport column -from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport ( - count_elements as cpp_count_elements, -) -from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( - lists_column_view, -) -from cudf._lib.pylibcudf.libcudf.lists.sorting cimport ( - sort_lists as cpp_sort_lists, -) -from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport ( - distinct as cpp_distinct, -) -from cudf._lib.pylibcudf.libcudf.types cimport ( - nan_equality, - null_equality, - null_order, - order, - size_type, -) +from cudf._lib.pylibcudf.libcudf.types cimport null_order, size_type from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf._lib import pylibcudf @@ -36,19 +15,10 @@ from cudf._lib.pylibcudf cimport Scalar @acquire_spill_lock() def count_elements(Column col): - - # shared_ptr required because lists_column_view has no default - # ctor - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) + return Column.from_pylibcudf( + pylibcudf.lists.count_elements( + col.to_pylibcudf(mode="read")) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_count_elements(list_view.get()[0])) - - result = Column.from_unique_ptr(move(c_result)) - return result @acquire_spill_lock() @@ -63,53 +33,25 @@ def explode_outer(list source_columns, int explode_column_idx): @acquire_spill_lock() def distinct(Column col, bool nulls_equal, bool nans_all_equal): - """ - nulls_equal == True indicates that libcudf should treat any two nulls as - equal, and as unequal otherwise. - nans_all_equal == True indicates that libcudf should treat any two - elements from {-nan, +nan} as equal, and as unequal otherwise. 
- """ - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) - ) - cdef null_equality c_nulls_equal = ( - null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL - ) - cdef nan_equality c_nans_equal = ( - nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL - ) - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_distinct(list_view.get()[0], - c_nulls_equal, - c_nans_equal) + return Column.from_pylibcudf( + pylibcudf.lists.distinct( + col.to_pylibcudf(mode="read"), + nulls_equal, + nans_all_equal, ) - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() def sort_lists(Column col, bool ascending, str na_position): - cdef shared_ptr[lists_column_view] list_view = ( - make_shared[lists_column_view](col.view()) - ) - cdef order c_sort_order = ( - order.ASCENDING if ascending else order.DESCENDING - ) - cdef null_order c_null_prec = ( - null_order.BEFORE if na_position == "first" else null_order.AFTER - ) - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_sort_lists(list_view.get()[0], c_sort_order, c_null_prec) + return Column.from_pylibcudf( + pylibcudf.lists.sort_lists( + col.to_pylibcudf(mode="read"), + ascending, + null_order.BEFORE if na_position == "first" else null_order.AFTER, + False, ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index e7959d21e01..a2eed94bb3c 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -18,16 +18,14 @@ from cython.operator cimport dereference from cudf.api.types import is_list_like -from cudf._lib.utils cimport data_from_unique_ptr +from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io -from cudf._lib import pylibcudf from cudf._lib.utils import _index_level_name, generate_pandas_metadata from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr -from libcpp.pair cimport pair from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move @@ -35,25 +33,20 @@ from libcpp.vector cimport vector cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types -cimport cudf._lib.pylibcudf.libcudf.types as cudf_types from cudf._lib.column cimport Column from cudf._lib.io.utils cimport ( + add_df_col_struct_names, make_sinks_info, make_source_info, - update_struct_field_names, ) from cudf._lib.pylibcudf.expressions cimport Expression from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource -from cudf._lib.pylibcudf.libcudf.expressions cimport expression +from cudf._lib.pylibcudf.io.parquet cimport ChunkedParquetReader from cudf._lib.pylibcudf.libcudf.io.parquet cimport ( - chunked_parquet_reader as cpp_chunked_parquet_reader, chunked_parquet_writer_options, merge_row_group_metadata as parquet_merge_metadata, parquet_chunked_writer as cpp_parquet_chunked_writer, - parquet_reader_options, - parquet_reader_options_builder, parquet_writer_options, - read_parquet as parquet_reader, write_parquet as parquet_writer, ) from cudf._lib.pylibcudf.libcudf.io.parquet_metadata cimport ( @@ -63,19 +56,17 @@ from cudf._lib.pylibcudf.libcudf.io.parquet_metadata cimport ( from cudf._lib.pylibcudf.libcudf.io.types cimport ( column_in_metadata, table_input_metadata, - table_metadata, 
) from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view -from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type +from cudf._lib.pylibcudf.libcudf.types cimport size_type from cudf._lib.utils cimport table_view_from_table from pyarrow.lib import NativeFile -from cudf._lib.concat import concat_columns +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf cimport Table from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT -from cudf._lib.utils cimport data_from_pylibcudf_table - cdef class BufferArrayFromVector: cdef Py_ssize_t length @@ -133,71 +124,37 @@ def _parse_metadata(meta): return file_is_range_index, file_index_cols, file_column_dtype -cdef pair[parquet_reader_options, bool] _setup_parquet_reader_options( - cudf_io_types.source_info source, - vector[vector[size_type]] row_groups, - bool use_pandas_metadata, - Expression filters, - object columns): - - cdef parquet_reader_options args - cdef parquet_reader_options_builder builder - cdef data_type cpp_timestamp_type = cudf_types.data_type( - cudf_types.type_id.EMPTY - ) - builder = ( - parquet_reader_options.builder(source) - .row_groups(row_groups) - .use_pandas_metadata(use_pandas_metadata) - .use_arrow_schema(True) - .timestamp_type(cpp_timestamp_type) - ) - if filters is not None: - builder = builder.filter(dereference(filters.c_obj.get())) - - args = move(builder.build()) - cdef vector[string] cpp_columns - allow_range_index = True - if columns is not None: - cpp_columns.reserve(len(columns)) - allow_range_index = len(columns) > 0 - for col in columns: - cpp_columns.push_back(str(col).encode()) - args.set_columns(cpp_columns) - allow_range_index &= filters is None - - return pair[parquet_reader_options, bool](args, allow_range_index) - cdef object _process_metadata(object df, - table_metadata table_meta, list names, + dict child_names, + list per_file_user_data, object row_groups, object filepaths_or_buffers, list pa_buffers, bool allow_range_index, bool use_pandas_metadata): - update_struct_field_names(df, table_meta.schema_info) + + add_df_col_struct_names(df, child_names) index_col = None is_range_index = True column_index_type = None index_col_names = None meta = None - cdef vector[unordered_map[string, string]] per_file_user_data = \ - table_meta.per_file_user_data for single_file in per_file_user_data: + if b'pandas' not in single_file: + continue json_str = single_file[b'pandas'].decode('utf-8') - if json_str != "": - meta = json.loads(json_str) - file_is_range_index, index_col, column_index_type = _parse_metadata(meta) - is_range_index &= file_is_range_index - - if not file_is_range_index and index_col is not None \ - and index_col_names is None: - index_col_names = {} - for idx_col in index_col: - for c in meta['columns']: - if c['field_name'] == idx_col: - index_col_names[idx_col] = c['name'] + meta = json.loads(json_str) + file_is_range_index, index_col, column_index_type = _parse_metadata(meta) + is_range_index &= file_is_range_index + + if not file_is_range_index and index_col is not None \ + and index_col_names is None: + index_col_names = {} + for idx_col in index_col: + for c in meta['columns']: + if c['field_name'] == idx_col: + index_col_names[idx_col] = c['name'] if meta is not None: # Book keep each column metadata as the order @@ -297,6 +254,76 @@ cdef object _process_metadata(object df, return df +def read_parquet_chunked( + filepaths_or_buffers, + columns=None, + row_groups=None, + use_pandas_metadata=True, + size_t chunk_read_limit=0, + size_t 
pass_read_limit=1024000000
+):
+    # Convert NativeFile buffers to NativeFileDatasource,
+    # but save original buffers in case we need to use
+    # pyarrow for metadata processing
+    # (See: https://github.com/rapidsai/cudf/issues/9599)
+
+    pa_buffers = []
+
+    new_bufs = []
+    for i, datasource in enumerate(filepaths_or_buffers):
+        if isinstance(datasource, NativeFile):
+            new_bufs.append(NativeFileDatasource(datasource))
+        else:
+            new_bufs.append(datasource)
+
+    # Note: If this function ever accepts filters,
+    # allow_range_index needs to be False when a filter is passed
+    # (see read_parquet)
+    allow_range_index = columns is not None and len(columns) != 0
+
+    reader = ChunkedParquetReader(
+        plc.io.SourceInfo(new_bufs),
+        columns,
+        row_groups,
+        use_pandas_metadata,
+        chunk_read_limit=chunk_read_limit,
+        pass_read_limit=pass_read_limit
+    )
+
+    tbl_w_meta = reader.read_chunk()
+    column_names = tbl_w_meta.column_names(include_children=False)
+    child_names = tbl_w_meta.child_names
+    per_file_user_data = tbl_w_meta.per_file_user_data
+    concatenated_columns = tbl_w_meta.tbl.columns()
+
+    # save memory
+    del tbl_w_meta
+
+    cdef Table tbl
+    while reader.has_next():
+        tbl = reader.read_chunk().tbl
+
+        for i in range(tbl.num_columns()):
+            concatenated_columns[i] = plc.concatenate.concatenate(
+                [concatenated_columns[i], tbl._columns[i]]
+            )
+            # Drop residual columns to save memory
+            tbl._columns[i] = None
+
+    df = cudf.DataFrame._from_data(
+        *_data_from_columns(
+            columns=[Column.from_pylibcudf(col) for col in concatenated_columns],
+            column_names=column_names,
+            index_names=None
+        )
+    )
+    df = _process_metadata(df, column_names, child_names,
+                           per_file_user_data, row_groups,
+                           filepaths_or_buffers, pa_buffers,
+                           allow_range_index, use_pandas_metadata)
+    return df
+
+
 cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
                    use_pandas_metadata=True, Expression filters=None):
@@ -322,33 +349,28 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
             pa_buffers.append(datasource)
             filepaths_or_buffers[i] = NativeFileDatasource(datasource)
 
-    cdef cudf_io_types.source_info source = make_source_info(
-        filepaths_or_buffers)
-
-    cdef vector[vector[size_type]] cpp_row_groups
-    if row_groups is not None:
-        cpp_row_groups = row_groups
-
-    # Setup parquet reader arguments
-    cdef parquet_reader_options args
-    cdef pair[parquet_reader_options, bool] c_res = _setup_parquet_reader_options(
-        source, cpp_row_groups, use_pandas_metadata, filters, columns)
-    args, allow_range_index = c_res.first, c_res.second
+    allow_range_index = True
+    if (columns is not None and len(columns) == 0) or filters:
+        allow_range_index = False
 
     # Read Parquet
-    cdef cudf_io_types.table_with_metadata c_result
-
-    with nogil:
-        c_result = move(parquet_reader(args))
+    tbl_w_meta = plc.io.parquet.read_parquet(
+        plc.io.SourceInfo(filepaths_or_buffers),
+        columns,
+        row_groups,
+        filters,
+        convert_strings_to_categories = False,
+        use_pandas_metadata = use_pandas_metadata,
+    )
 
-    names = [info.name.decode() for info in c_result.metadata.schema_info]
+    df = cudf.DataFrame._from_data(
+        *data_from_pylibcudf_io(tbl_w_meta)
+    )
 
-    df = cudf.DataFrame._from_data(*data_from_unique_ptr(
-        move(c_result.tbl),
-        column_names=names
-    ))
-    df = _process_metadata(df, c_result.metadata, names, row_groups,
-                           filepaths_or_buffers, pa_buffers,
+    df = _process_metadata(df, tbl_w_meta.column_names(include_children=False),
+                           tbl_w_meta.child_names, tbl_w_meta.per_file_user_data,
+                           row_groups, filepaths_or_buffers,
pa_buffers, allow_range_index, use_pandas_metadata) return df @@ -804,120 +826,6 @@ cdef class ParquetWriter: self.initialized = True -cdef class ParquetReader: - cdef bool initialized - cdef unique_ptr[cpp_chunked_parquet_reader] reader - cdef size_t chunk_read_limit - cdef size_t pass_read_limit - cdef size_t row_group_size_bytes - cdef table_metadata result_meta - cdef vector[unordered_map[string, string]] per_file_user_data - cdef object pandas_meta - cdef list pa_buffers - cdef bool allow_range_index - cdef object row_groups - cdef object filepaths_or_buffers - cdef object names - cdef object column_index_type - cdef object index_col_names - cdef bool is_range_index - cdef object index_col - cdef bool cpp_use_pandas_metadata - - def __cinit__(self, filepaths_or_buffers, columns=None, row_groups=None, - use_pandas_metadata=True, - size_t chunk_read_limit=0, - size_t pass_read_limit=1024000000): - - # Convert NativeFile buffers to NativeFileDatasource, - # but save original buffers in case we need to use - # pyarrow for metadata processing - # (See: https://github.com/rapidsai/cudf/issues/9599) - - pa_buffers = [] - for i, datasource in enumerate(filepaths_or_buffers): - if isinstance(datasource, NativeFile): - pa_buffers.append(datasource) - filepaths_or_buffers[i] = NativeFileDatasource(datasource) - self.pa_buffers = pa_buffers - cdef cudf_io_types.source_info source = make_source_info( - filepaths_or_buffers) - - self.cpp_use_pandas_metadata = use_pandas_metadata - - cdef vector[vector[size_type]] cpp_row_groups - if row_groups is not None: - cpp_row_groups = row_groups - cdef parquet_reader_options args - cdef pair[parquet_reader_options, bool] c_res = _setup_parquet_reader_options( - source, cpp_row_groups, use_pandas_metadata, None, columns) - args, self.allow_range_index = c_res.first, c_res.second - - with nogil: - self.reader.reset( - new cpp_chunked_parquet_reader( - chunk_read_limit, - pass_read_limit, - args - ) - ) - self.initialized = False - self.row_groups = row_groups - self.filepaths_or_buffers = filepaths_or_buffers - - def _has_next(self): - cdef bool res - with nogil: - res = self.reader.get()[0].has_next() - return res - - def _read_chunk(self): - # Read Parquet - cdef cudf_io_types.table_with_metadata c_result - - with nogil: - c_result = move(self.reader.get()[0].read_chunk()) - - if not self.initialized: - self.names = [info.name.decode() for info in c_result.metadata.schema_info] - self.result_meta = c_result.metadata - - df = cudf.DataFrame._from_data(*data_from_unique_ptr( - move(c_result.tbl), - column_names=self.names, - )) - - self.initialized = True - return df - - def read(self): - dfs = self._read_chunk() - column_names = dfs._column_names - concatenated_columns = list(dfs._columns) - del dfs - while self._has_next(): - new_chunk = list(self._read_chunk()._columns) - for i in range(len(column_names)): - concatenated_columns[i] = concat_columns( - [concatenated_columns[i], new_chunk[i]] - ) - # Must drop any residual GPU columns to save memory - new_chunk[i] = None - - dfs = cudf.DataFrame._from_data( - *data_from_pylibcudf_table( - pylibcudf.Table( - [col.to_pylibcudf(mode="read") for col in concatenated_columns] - ), - column_names=column_names, - index_names=None - ) - ) - - return _process_metadata(dfs, self.result_meta, self.names, self.row_groups, - self.filepaths_or_buffers, self.pa_buffers, - self.allow_range_index, self.cpp_use_pandas_metadata) - cpdef merge_filemetadata(object filemetadata_list): """ Cython function to call into 
libcudf API, see `merge_row_group_metadata`.
diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 56bfa0ba332..64634b7a6f9 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -1,4 +1,5 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
+import warnings
 
 import cudf
 from cudf.core.buffer import acquire_spill_lock
@@ -26,11 +27,15 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
         A numpy data type to use for the output, defaults
         to the same type as the input column
     """
-
-    col_dtype = (
-        dtype if dtype is not None
-        else incol._reduction_result_dtype(reduction_op)
-    )
+    if dtype is not None:
+        warnings.warn(
+            "dtype is deprecated and will be removed in a future release. "
+            "Cast the result (e.g. .astype) after the operation instead.",
+            FutureWarning
+        )
+        col_dtype = dtype
+    else:
+        col_dtype = incol._reduction_result_dtype(reduction_op)
 
     # check empty case
     if len(incol) <= incol.null_count:
diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index fc672caa574..253fdf7b0d9 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -21,8 +21,6 @@ from cudf._lib.types cimport (
 import cudf
 from cudf._lib import pylibcudf
 
-size_type_dtype = np.dtype("int32")
-
 
 class TypeId(IntEnum):
     EMPTY = libcudf_types.type_id.EMPTY
@@ -150,6 +148,8 @@ datetime_unit_map = {
     TypeId.TIMESTAMP_NANOSECONDS: "ns",
 }
 
+size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID]
+
 
 class Interpolation(IntEnum):
     LINEAR = (
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index d97e9c815b6..294ae2fd985 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -90,7 +90,7 @@ def is_integer(obj):
     bool
     """
     if isinstance(obj, cudf.Scalar):
-        return pd.api.types.is_integer_dtype(obj.dtype)
+        return obj.dtype.kind in "iu"
     return pd.api.types.is_integer(obj)
 
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 479f87bb78b..c91514202c5 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -19,14 +19,7 @@
 )
 from cudf._lib.types import size_type_dtype
 from cudf.api.extensions import no_default
-from cudf.api.types import (
-    is_integer,
-    is_integer_dtype,
-    is_list_like,
-    is_scalar,
-    is_signed_integer_dtype,
-    is_unsigned_integer_dtype,
-)
+from cudf.api.types import is_integer, is_list_like, is_scalar
 from cudf.core.abc import Serializable
 from cudf.core.column import ColumnBase, column
 from cudf.errors import MixedTypeError
@@ -62,6 +55,12 @@ def copy(self, deep: bool = True) -> Self:
     def __len__(self):
         raise NotImplementedError
 
+    def __bool__(self):
+        raise ValueError(
+            f"The truth value of a {type(self).__name__} is ambiguous. Use "
+            "a.empty, a.bool(), a.item(), a.any() or a.all()."
+        )
+
     @property
     def size(self):
         # The size of an index is always its length irrespective of dimension.
@@ -99,7 +98,7 @@ def astype(self, dtype, copy: bool = True):
         """
         raise NotImplementedError
 
-    def argsort(self, *args, **kwargs):
+    def argsort(self, *args, **kwargs) -> cupy.ndarray:
         """Return the integer indices that would sort the index.
 
         Parameters vary by subclass.
@@ -615,12 +614,8 @@ def union(self, other, sort=None):
                 # Bools + other types will result in mixed type.
                 # This is not yet consistent in pandas and specific to APIs.
                raise MixedTypeError("Cannot perform union with mixed types")
-            if (
-                is_signed_integer_dtype(self.dtype)
-                and is_unsigned_integer_dtype(other.dtype)
-            ) or (
-                is_unsigned_integer_dtype(self.dtype)
-                and is_signed_integer_dtype(other.dtype)
+            if (self.dtype.kind == "i" and other.dtype.kind == "u") or (
+                self.dtype.kind == "u" and other.dtype.kind == "i"
             ):
                 # signed + unsigned types will result in
                 # mixed type for union in pandas.
@@ -873,6 +868,24 @@ def to_numpy(self):
         """Convert to a numpy array."""
         raise NotImplementedError
 
+    def to_flat_index(self) -> Self:
+        """
+        Identity method.
+
+        This is implemented for compatibility with subclass implementations
+        when chaining.
+
+        Returns
+        -------
+        pd.Index
+            Caller.
+
+        See Also
+        --------
+        MultiIndex.to_flat_index : Subclass implementation.
+        """
+        return self
+
     def any(self):
         """
         Return whether any element is True in the Index.
@@ -950,7 +963,7 @@ def to_pandas(self, *, nullable: bool = False, arrow_type: bool = False):
         """
         raise NotImplementedError
 
-    def isin(self, values):
+    def isin(self, values, level=None):
         """Return a boolean array where the index values are in values.
 
         Compute boolean array of whether each index value is found in
@@ -961,6 +974,9 @@
         ----------
         values : set, list-like, Index
             Sought values.
+        level : str or int, optional
+            Name or position of the index level to use (if the index is a
+            `MultiIndex`).
 
         Returns
         -------
@@ -984,7 +1000,7 @@
         # ColumnBase.isin).
         raise NotImplementedError
 
-    def unique(self):
+    def unique(self, level: int | None = None):
         """
         Return unique values in the index.
 
@@ -1525,7 +1541,7 @@ def sort_values(
         ascending=True,
         na_position="last",
         key=None,
-    ):
+    ) -> Self | tuple[Self, cupy.ndarray]:
         """
         Return a sorted copy of the index, and optionally return the
         indices that sorted the index itself.
@@ -2097,7 +2113,7 @@ def _gather(self, gather_map, nullify=False, check_bounds=True):
 
         # TODO: For performance, the check and conversion of gather map should
         # be done by the caller. This check will be removed in a future release.
- if not is_integer_dtype(gather_map.dtype): + if gather_map.dtype.kind not in "iu": gather_map = gather_map.astype(size_type_dtype) if not _gather_map_is_valid( diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 6003a0f6aea..18ab32d2c9e 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -47,7 +47,7 @@ def _check_and_cast_columns_with_other( other_is_scalar = is_scalar(other) if other_is_scalar: - if isinstance(other, float) and not np.isnan(other): + if isinstance(other, (float, np.floating)) and not np.isnan(other): try: is_safe = source_dtype.type(other) == other except OverflowError: diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 9aaccca349d..9433a91b9c6 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -262,37 +262,10 @@ def add_categories(self, new_categories: Any) -> SeriesOrIndex | None: dtype: category Categories (2, int64): [1, 2] """ - old_categories = self._column.categories - new_categories = column.as_column( - new_categories, - dtype=old_categories.dtype if len(new_categories) == 0 else None, - ) - - if is_mixed_with_object_dtype(old_categories, new_categories): - raise TypeError( - f"cudf does not support adding categories with existing " - f"categories of dtype `{old_categories.dtype}` and new " - f"categories of dtype `{new_categories.dtype}`, please " - f"type-cast new_categories to the same type as " - f"existing categories." - ) - common_dtype = find_common_type( - [old_categories.dtype, new_categories.dtype] + return self._return_or_inplace( + self._column.add_categories(new_categories=new_categories) ) - new_categories = new_categories.astype(common_dtype) - old_categories = old_categories.astype(common_dtype) - - if old_categories.isin(new_categories).any(): - raise ValueError("new categories must not include old categories") - - new_categories = old_categories.append(new_categories) - out_col = self._column - if not out_col._categories_equal(new_categories): - out_col = out_col._set_categories(new_categories) - - return self._return_or_inplace(out_col) - def remove_categories( self, removals: Any, @@ -349,23 +322,9 @@ def remove_categories( dtype: category Categories (3, int64): [1, 2, 10] """ - - cats = self.categories.to_series() - removals = cudf.Series(removals, dtype=cats.dtype) - removals_mask = removals.isin(cats) - - # ensure all the removals are in the current categories - # list. If not, raise an error to match Pandas behavior - if not removals_mask.all(): - vals = removals[~removals_mask].to_numpy() - raise ValueError(f"removals must all be in old categories: {vals}") - - new_categories = cats[~cats.isin(removals)]._column - out_col = self._column - if not out_col._categories_equal(new_categories): - out_col = out_col._set_categories(new_categories) - - return self._return_or_inplace(out_col) + return self._return_or_inplace( + self._column.remove_categories(removals=removals) + ) def set_categories( self, @@ -1319,7 +1278,7 @@ def _set_categories( new_categories: Any, is_unique: bool = False, ordered: bool = False, - ) -> CategoricalColumn: + ) -> Self: """Returns a new CategoricalColumn with the categories set to the specified *new_categories*. 
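As a quick, hedged illustration of the accessor-level semantics the refactored
`add_categories`/`remove_categories` in the hunks above and below preserve (a
sketch, not part of the patch; the series and printed values are illustrative,
though the ValueError message matches the new column code):

```python
import cudf

s = cudf.Series([1, 2, 2], dtype="category")

# add_categories appends new categories without changing existing values.
added = s.cat.add_categories([10])
print(added.cat.categories)  # [1, 2, 10]

# remove_categories nulls out values whose category was removed.
removed = added.cat.remove_categories([2])
print(removed.isnull().sum())  # 2

# Removals outside the current categories raise, matching pandas behavior.
try:
    s.cat.remove_categories([99])
except ValueError as err:
    print(err)  # removals must all be in old categories
```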
@@ -1376,17 +1335,68 @@ def _set_categories( new_codes = df._data["new_codes"] # codes can't have masks, so take mask out before moving in - return column.build_categorical_column( - categories=new_cats, - codes=column.build_column( - new_codes.base_data, dtype=new_codes.dtype + return cast( + Self, + column.build_categorical_column( + categories=new_cats, + codes=column.build_column( + new_codes.base_data, dtype=new_codes.dtype + ), + mask=new_codes.base_mask, + size=new_codes.size, + offset=new_codes.offset, + ordered=ordered, ), - mask=new_codes.base_mask, - size=new_codes.size, - offset=new_codes.offset, - ordered=ordered, ) + def add_categories(self, new_categories: Any) -> Self: + old_categories = self.categories + new_categories = column.as_column( + new_categories, + dtype=old_categories.dtype if len(new_categories) == 0 else None, + ) + if is_mixed_with_object_dtype(old_categories, new_categories): + raise TypeError( + f"cudf does not support adding categories with existing " + f"categories of dtype `{old_categories.dtype}` and new " + f"categories of dtype `{new_categories.dtype}`, please " + f"type-cast new_categories to the same type as " + f"existing categories." + ) + common_dtype = find_common_type( + [old_categories.dtype, new_categories.dtype] + ) + + new_categories = new_categories.astype(common_dtype) + old_categories = old_categories.astype(common_dtype) + + if old_categories.isin(new_categories).any(): + raise ValueError("new categories must not include old categories") + + new_categories = old_categories.append(new_categories) + if not self._categories_equal(new_categories): + return self._set_categories(new_categories) + return self + + def remove_categories( + self, + removals: Any, + ) -> Self: + removals = column.as_column(removals).astype(self.categories.dtype) + removals_mask = removals.isin(self.categories) + + # ensure all the removals are in the current categories + # list. If not, raise an error to match Pandas behavior + if not removals_mask.all(): + raise ValueError("removals must all be in old categories") + + new_categories = self.categories.apply_boolean_mask( + self.categories.isin(removals).unary_operator("not") + ) + if not self._categories_equal(new_categories): + return self._set_categories(new_categories) + return self + def reorder_categories( self, new_categories: Any, @@ -1404,6 +1414,16 @@ def reorder_categories( ) return self._set_categories(new_categories, ordered=ordered) + def rename_categories(self, new_categories) -> CategoricalColumn: + raise NotImplementedError( + "rename_categories is currently not supported." + ) + + def remove_unused_categories(self) -> Self: + raise NotImplementedError( + "remove_unused_categories is currently not supported." + ) + def as_ordered(self, ordered: bool): if self.dtype.ordered == ordered: return self diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 9467bbeed15..32e6aade65b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -71,7 +71,7 @@ get_time_unit, is_column_like, is_mixed_with_object_dtype, - min_scalar_type, + min_signed_type, min_unsigned_type, ) from cudf.utils.utils import _array_ufunc, mask_dtype @@ -261,7 +261,7 @@ def all(self, skipna: bool = True) -> bool: if self.null_count == self.size: return True - return libcudf.reduce.reduce("all", self, dtype=np.bool_) + return libcudf.reduce.reduce("all", self) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. 
@@ -271,7 +271,7 @@ def any(self, skipna: bool = True) -> bool: elif skipna and self.null_count == self.size: return False - return libcudf.reduce.reduce("any", self, dtype=np.bool_) + return libcudf.reduce.reduce("any", self) def dropna(self) -> Self: if self.has_nulls(): @@ -1305,7 +1305,10 @@ def _reduce( skipna=skipna, min_count=min_count ) if isinstance(preprocessed, ColumnBase): - return libcudf.reduce.reduce(op, preprocessed, **kwargs) + dtype = kwargs.pop("dtype", None) + return libcudf.reduce.reduce( + op, preprocessed, dtype=dtype, **kwargs + ) return preprocessed def _process_for_reduction( @@ -1336,6 +1339,8 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: Determine the correct dtype to pass to libcudf based on the input dtype, data dtype, and specific reduction op """ + if reduction_op in {"any", "all"}: + return np.dtype(np.bool_) return self.dtype def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: @@ -1351,7 +1356,7 @@ def _label_encoding( self, cats: ColumnBase, dtype: Dtype | None = None, - na_sentinel: ScalarLike | None = None, + na_sentinel: cudf.Scalar | None = None, ): """ Convert each value in `self` into an integer code, with `cats` @@ -1391,7 +1396,7 @@ def _return_sentinel_column(): return as_column(na_sentinel, dtype=dtype, length=len(self)) if dtype is None: - dtype = min_scalar_type(max(len(cats), na_sentinel), 8) + dtype = min_signed_type(max(len(cats), na_sentinel.value), 8) if is_mixed_with_object_dtype(self, cats): return _return_sentinel_column() @@ -1453,9 +1458,10 @@ def column_empty_like( return column_empty(row_count, dtype, masked) -def _has_any_nan(arbitrary): +def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: + """Check if an object dtype Series or array contains NaN.""" return any( - ((isinstance(x, float) or isinstance(x, np.floating)) and np.isnan(x)) + isinstance(x, (float, np.floating)) and np.isnan(x) for x in np.asarray(arbitrary) ) @@ -2213,25 +2219,26 @@ def as_column( and arbitrary.null_count > 0 ): arbitrary = arbitrary.cast(pa.float64()) - if cudf.get_option( - "default_integer_bitwidth" - ) and pa.types.is_integer(arbitrary.type): - dtype = _maybe_convert_to_default_type("int") - elif cudf.get_option( - "default_float_bitwidth" - ) and pa.types.is_floating(arbitrary.type): - dtype = _maybe_convert_to_default_type("float") + if ( + cudf.get_option("default_integer_bitwidth") + and pa.types.is_integer(arbitrary.type) + ) or ( + cudf.get_option("default_float_bitwidth") + and pa.types.is_floating(arbitrary.type) + ): + dtype = _maybe_convert_to_default_type( + cudf.dtype(arbitrary.type.to_pandas_dtype()) + ) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): arbitrary = pd.Series(arbitrary) - if cudf.get_option( - "default_integer_bitwidth" - ) and arbitrary.dtype.kind in set("iu"): - dtype = _maybe_convert_to_default_type("int") - elif ( + if ( + cudf.get_option("default_integer_bitwidth") + and arbitrary.dtype.kind in set("iu") + ) or ( cudf.get_option("default_float_bitwidth") and arbitrary.dtype.kind == "f" ): - dtype = _maybe_convert_to_default_type("float") + dtype = _maybe_convert_to_default_type(arbitrary.dtype) return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) @@ -2307,9 +2314,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # Notice, we can always cast pure null columns not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] if len(not_null_col_dtypes) and all( - _is_non_decimal_numeric_dtype(dtyp) - and 
np.issubdtype(dtyp, np.datetime64) - for dtyp in not_null_col_dtypes + _is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M" + for dtype in not_null_col_dtypes ): common_dtype = find_common_type(not_null_col_dtypes) # Cast all columns to the common dtype diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 004a059af95..73902789c11 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -485,13 +485,11 @@ def as_string_column(self) -> cudf.core.column.StringColumn: format = format.split(" ")[0] return self.strftime(format) - def mean( - self, skipna=None, min_count: int = 0, dtype=np.float64 - ) -> ScalarLike: + def mean(self, skipna=None, min_count: int = 0) -> ScalarLike: return pd.Timestamp( cast( "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna, min_count=min_count, dtype=dtype), + ).mean(skipna=skipna, min_count=min_count), unit=self.time_unit, ).as_unit(self.time_unit) @@ -499,12 +497,11 @@ def std( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype = np.float64, ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + skipna=skipna, min_count=min_count, ddof=ddof ) * _unit_to_nanoseconds_conversion[self.time_unit], ).as_unit(self.time_unit) @@ -642,7 +639,7 @@ def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) def can_cast_safely(self, to_dtype: Dtype) -> bool: - if np.issubdtype(to_dtype, np.datetime64): + if to_dtype.kind == "M": # type: ignore[union-attr] to_res, _ = np.datetime_data(to_dtype) self_res, _ = np.datetime_data(self.dtype) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index a63055ed527..6a7f338b065 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -15,7 +15,7 @@ from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) -from cudf.api.types import is_integer_dtype, is_scalar +from cudf.api.types import is_scalar from cudf.core.buffer import as_buffer from cudf.core.column import ColumnBase from cudf.core.dtypes import ( @@ -150,7 +150,7 @@ def _validate_fillna_value( def normalize_binop_value(self, other): if isinstance(other, ColumnBase): if isinstance(other, cudf.core.column.NumericalColumn): - if not is_integer_dtype(other.dtype): + if other.dtype.kind not in "iu": raise TypeError( "Decimal columns only support binary operations with " "integer numerical columns." diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index cc15e78314e..1b7cd95b3d0 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -564,10 +564,11 @@ def take(self, lists_indices: ColumnLike) -> ParentType: raise ValueError( "lists_indices and list column is of different " "size." ) - if not _is_non_decimal_numeric_dtype( - lists_indices_col.children[1].dtype - ) or not np.issubdtype( - lists_indices_col.children[1].dtype, np.integer + if ( + not _is_non_decimal_numeric_dtype( + lists_indices_col.children[1].dtype + ) + or lists_indices_col.children[1].dtype.kind not in "iu" ): raise TypeError( "lists_indices should be column of values of index types." @@ -646,9 +647,17 @@ def sort_values( dtype: list .. 
pandas-compat:: - **ListMethods.sort_values** + `pandas.Series.list.sort_values` + + This method does not exist in pandas but it can be run + as: - The ``inplace`` and ``kind`` arguments are currently not supported. + >>> import pandas as pd + >>> s = pd.Series([[3, 2, 1], [2, 4, 3]]) + >>> print(s.apply(sorted)) + 0 [1, 2, 3] + 1 [2, 3, 4] + dtype: object """ if inplace: raise NotImplementedError("`inplace` not currently implemented.") diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index cea68c88c90..f9404eb3b40 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -12,12 +12,7 @@ import cudf from cudf import _lib as libcudf from cudf._lib import pylibcudf -from cudf.api.types import ( - is_float_dtype, - is_integer, - is_integer_dtype, - is_scalar, -) +from cudf.api.types import is_integer, is_scalar from cudf.core.column import ( ColumnBase, as_column, @@ -29,10 +24,10 @@ from cudf.core.mixins import BinaryOperand from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( + find_common_type, min_column_type, min_signed_type, np_dtypes_to_pandas_dtypes, - numeric_normalize_types, ) from .numerical_base import NumericalBaseColumn @@ -225,25 +220,17 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: tmp = self if reflect else other # Guard against division by zero for integers. if ( - (tmp.dtype.type in int_float_dtype_mapping) - and (tmp.dtype.type != np.bool_) - and ( - ( - ( - np.isscalar(tmp) - or ( - isinstance(tmp, cudf.Scalar) - # host to device copy - and tmp.is_valid() - ) - ) - and (0 == tmp) - ) - or ((isinstance(tmp, NumericalColumn)) and (0 in tmp)) - ) + tmp.dtype.type in int_float_dtype_mapping + and tmp.dtype.kind != "b" ): - out_dtype = cudf.dtype("float64") - + if isinstance(tmp, NumericalColumn) and 0 in tmp: + out_dtype = cudf.dtype("float64") + elif isinstance(tmp, cudf.Scalar): + if tmp.is_valid() and tmp == 0: + # tmp == 0 can return NA + out_dtype = cudf.dtype("float64") + elif is_scalar(tmp) and tmp == 0: + out_dtype = cudf.dtype("float64") if op in { "__lt__", "__gt__", @@ -257,7 +244,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: out_dtype = "bool" if op in {"__and__", "__or__", "__xor__"}: - if is_float_dtype(self.dtype) or is_float_dtype(other.dtype): + if self.dtype.kind == "f" or other.dtype.kind == "f": raise TypeError( f"Operation 'bitwise {op[2:-2]}' not supported between " f"{self.dtype.type.__name__} and " @@ -268,8 +255,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if ( op == "__pow__" - and is_integer_dtype(self.dtype) - and (is_integer(other) or is_integer_dtype(other.dtype)) + and self.dtype.kind in "iu" + and (is_integer(other) or other.dtype.kind in "iu") ): op = "INT_POW" @@ -395,7 +382,7 @@ def all(self, skipna: bool = True) -> bool: if result_col.null_count == result_col.size: return True - return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + return libcudf.reduce.reduce("all", result_col) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. 
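The restructured zero-divisor guard above only widens the output dtype when the divisor is, or contains, zero. A small sketch of the behavior it targets (illustrative only; exact result values are not asserted by this patch):

    import cudf

    s = cudf.Series([1, 2, 3])       # int64 column
    s // 2                           # nonzero divisor: result stays integral
    s // 0                           # scalar zero divisor: promoted to float64
    s // cudf.Series([1, 0, 1])      # a zero anywhere in a column divisor also
                                     # takes the float64 path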
@@ -406,7 +393,7 @@ def any(self, skipna: bool = True) -> bool: elif skipna and result_col.null_count == result_col.size: return False - return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + return libcudf.reduce.reduce("any", result_col) @functools.cached_property def nan_count(self) -> int: @@ -517,11 +504,15 @@ def find_and_replace( ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: return self.copy() - to_replace_col, replacement_col, replaced = numeric_normalize_types( - to_replace_col, replacement_col, self + common_type = find_common_type( + (to_replace_col.dtype, replacement_col.dtype, self.dtype) ) + replaced = self.astype(common_type) df = cudf.DataFrame._from_data( - {"old": to_replace_col, "new": replacement_col} + { + "old": to_replace_col.astype(common_type), + "new": replacement_col.astype(common_type), + } ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: @@ -684,15 +675,16 @@ def to_pandas( return super().to_pandas(nullable=nullable, arrow_type=arrow_type) def _reduction_result_dtype(self, reduction_op: str) -> Dtype: - col_dtype = self.dtype if reduction_op in {"sum", "product"}: - col_dtype = ( - col_dtype if col_dtype.kind == "f" else np.dtype("int64") - ) + if self.dtype.kind == "f": + return self.dtype + return np.dtype("int64") elif reduction_op == "sum_of_squares": - col_dtype = np.result_type(col_dtype, np.dtype("uint64")) + return np.result_type(self.dtype, np.dtype("uint64")) + elif reduction_op in {"var", "std", "mean"}: + return np.dtype("float64") - return col_dtype + return super()._reduction_result_dtype(reduction_op) def _normalize_find_and_replace_input( diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 95c78c5efcb..f41010062c8 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -144,32 +144,27 @@ def mean( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ): - return self._reduce( - "mean", skipna=skipna, min_count=min_count, dtype=dtype - ) + return self._reduce("mean", skipna=skipna, min_count=min_count) def var( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ddof=1, ): return self._reduce( - "var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + "var", skipna=skipna, min_count=min_count, ddof=ddof ) def std( self, skipna: bool | None = None, min_count: int = 0, - dtype=np.float64, ddof=1, ): return self._reduce( - "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + "std", skipna=skipna, min_count=min_count, ddof=ddof ) def median(self, skipna: bool | None = None) -> NumericalBaseColumn: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 96f9cdfd655..ec95c50f455 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -612,7 +612,7 @@ def extract( dtype: object .. pandas-compat:: - **StringMethods.extract** + :meth:`pandas.Series.str.extract` The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. @@ -738,7 +738,7 @@ def contains( dtype: bool .. pandas-compat:: - **StringMethods.contains** + :meth:`pandas.Series.str.contains` The parameters `case` and `na` are not yet supported and will raise a NotImplementedError if anything other than the default @@ -974,7 +974,7 @@ def replace( dtype: object ..
pandas-compat:: - **StringMethods.replace** + :meth:`pandas.Series.str.replace` The parameters `case` and `flags` are not yet supported and will raise a `NotImplementedError` if anything other than the default @@ -2803,7 +2803,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: ) .. pandas-compat:: - **StringMethods.partition** + :meth:`pandas.Series.str.partition` The parameter `expand` is not yet supported and will raise a `NotImplementedError` if anything other than the default @@ -3527,7 +3527,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: Index([0, 0, 2, 1], dtype='int64') .. pandas-compat:: - **StringMethods.count** + :meth:`pandas.Series.str.count` - `flags` parameter currently only supports re.DOTALL and re.MULTILINE. @@ -3607,7 +3607,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: dtype: list .. pandas-compat:: - **StringMethods.findall** + :meth:`pandas.Series.str.findall` The `flags` parameter currently only supports re.DOTALL and re.MULTILINE. @@ -3811,7 +3811,7 @@ def endswith(self, pat: str) -> SeriesOrIndex: dtype: bool .. pandas-compat:: - **StringMethods.endswith** + :meth:`pandas.Series.str.endswith` `na` parameter is not yet supported, as cudf uses native strings instead of Python objects. @@ -4264,7 +4264,7 @@ def match( dtype: bool .. pandas-compat:: - **StringMethods.match** + :meth:`pandas.Series.str.match` Parameters `case` and `na` are currently not supported. The `flags` parameter currently only supports re.DOTALL and diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 36d7d9f9614..59ea1cc002c 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -287,11 +287,11 @@ def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: return self return libcudf.unary.cast(self, dtype=dtype) - def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: + def mean(self, skipna=None) -> pd.Timedelta: return pd.Timedelta( cast( "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna, dtype=dtype), + ).mean(skipna=skipna), unit=self.time_unit, ).as_unit(self.time_unit) @@ -345,12 +345,11 @@ def std( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype = np.float64, ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype + skipna=skipna, min_count=min_count, ddof=ddof ), unit=self.time_unit, ).as_unit(self.time_unit) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index f30a557efb0..819d351b2c4 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -16,6 +16,8 @@ from cudf.core import column if TYPE_CHECKING: + from typing_extensions import Self + from cudf._typing import Dtype from cudf.core.column import ColumnBase @@ -86,58 +88,58 @@ class ColumnAccessor(abc.MutableMapping): (default=None). verify : bool, optional For non ColumnAccessor inputs, whether to verify - column length and type + column lengths and that all data.values() are Columns """ - _data: "dict[Any, ColumnBase]" - multiindex: bool + _data: dict[Any, ColumnBase] _level_names: tuple[Any, ...]
def __init__( self, - data: abc.MutableMapping | ColumnAccessor | None = None, + data: abc.MutableMapping[Any, ColumnBase] | Self, multiindex: bool = False, level_names=None, rangeindex: bool = False, label_dtype: Dtype | None = None, verify: bool = True, ): - self.rangeindex = rangeindex - self.label_dtype = label_dtype - if data is None: - data = {} - # TODO: we should validate the keys of `data` if isinstance(data, ColumnAccessor): - multiindex = multiindex or data.multiindex - level_names = level_names or data.level_names self._data = data._data - self.multiindex = multiindex - self._level_names = level_names - self.rangeindex = data.rangeindex - self.label_dtype = data.label_dtype - else: + self._level_names = data.level_names + self.multiindex: bool = data.multiindex + self.rangeindex: bool = data.rangeindex + self.label_dtype: Dtype | None = data.label_dtype + elif isinstance(data, abc.MutableMapping): # This code path is performance-critical for copies and should be # modified with care. - data = dict(data) if data and verify: - result = {} # Faster than next(iter(data.values())) column_length = len(data[next(iter(data))]) - for k, v in data.items(): - # Much faster to avoid the function call if possible; the - # extra isinstance is negligible if we do have to make a - # column from something else. - if not isinstance(v, column.ColumnBase): - v = column.as_column(v) - if len(v) != column_length: + # TODO: we should validate the keys of `data` + for col in data.values(): + if not isinstance(col, column.ColumnBase): + raise ValueError( + f"All data.values() must be Column, not {type(col).__name__}" + ) + if len(col) != column_length: raise ValueError("All columns must be of equal length") - result[k] = v - self._data = result - else: - self._data = data + if not isinstance(data, dict): + data = dict(data) + self._data = data + + if rangeindex and multiindex: + raise ValueError( + f"{rangeindex=} and {multiindex=} cannot both be True." 
+ ) + self.rangeindex = rangeindex self.multiindex = multiindex + self.label_dtype = label_dtype self._level_names = level_names + else: + raise ValueError( + f"data must be a ColumnAccessor or MutableMapping, not {type(data).__name__}" + ) def __iter__(self): return iter(self._data) @@ -161,7 +163,9 @@ def __repr__(self) -> str: type_info = ( f"{self.__class__.__name__}(" f"multiindex={self.multiindex}, " - f"level_names={self.level_names})" + f"level_names={self.level_names}, " + f"rangeindex={self.rangeindex}, " + f"label_dtype={self.label_dtype})" ) column_info = "\n".join( [f"{name}: {col.dtype}" for name, col in self.items()] diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b3d938829c9..6ea11fe9f64 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -83,8 +83,7 @@ cudf_dtype_from_pydata_dtype, find_common_type, is_column_like, - min_scalar_type, - numeric_normalize_types, + min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api @@ -103,20 +102,6 @@ "var": "nanvar", } -_numeric_reduction_ops = ( - "mean", - "min", - "max", - "sum", - "product", - "prod", - "std", - "var", - "kurtosis", - "kurt", - "skew", -) - def _shape_mismatch_error(x, y): raise ValueError( @@ -490,6 +475,7 @@ def __getitem__(self, arg): {key: ca._data[key] for key in column_names}, multiindex=ca.multiindex, level_names=ca.level_names, + verify=False, ), index=index, ) @@ -500,6 +486,7 @@ def __getitem__(self, arg): {key: ca._data[key] for key in column_names}, multiindex=ca.multiindex, level_names=ca.level_names, + verify=False, ), index=index, ) @@ -609,6 +596,9 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): dtype : dtype, default None Data type to force. Only a single dtype is allowed. If None, infer. + copy : bool or None, default None + Copy data from inputs. + Currently not implemented. nan_as_null : bool, Default True If ``None``/``True``, converts ``np.nan`` values to ``null`` values. @@ -695,8 +685,11 @@ def __init__( index=None, columns=None, dtype=None, + copy=None, nan_as_null=no_default, ): + if copy is not None: + raise NotImplementedError("copy is not currently implemented.") super().__init__() if nan_as_null is no_default: nan_as_null = not cudf.get_option("mode.pandas_compatible") @@ -780,6 +773,7 @@ def __init__( else None, rangeindex=rangeindex, label_dtype=label_dtype, + verify=False, ) elif isinstance(data, ColumnAccessor): raise TypeError( @@ -923,7 +917,8 @@ def _init_from_series_list(self, data, columns, index): final_index = ensure_index(index) series_lengths = list(map(len, data)) - data = numeric_normalize_types(*data) + common_dtype = find_common_type([obj.dtype for obj in data]) + data = [obj.astype(common_dtype) for obj in data] if series_lengths.count(series_lengths[0]) == len(series_lengths): # Calculating the final dataframe columns by # getting union of all `index` of the Series objects. 
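A short sketch of the stricter `ColumnAccessor` construction contract introduced above (internal API, shown purely for illustration; the import paths are assumed from the diff context):

    from cudf.core.column import as_column
    from cudf.core.column_accessor import ColumnAccessor

    ColumnAccessor({"a": as_column([1, 2, 3])})  # values must already be Columns
    ColumnAccessor({"a": [1, 2, 3]})             # raises ValueError: a list is not a Column
    ColumnAccessor({}, rangeindex=True, multiindex=True)  # raises ValueError: the two
                                                          # flags are mutually exclusive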
@@ -939,7 +934,7 @@ def _init_from_series_list(self, data, columns, index): ) if not series.index.equals(final_columns): series = series.reindex(final_columns) - self._data[idx] = column.as_column(series._column) + self._data[idx] = series._column # Setting `final_columns` to self._index so # that the resulting `transpose` will have @@ -1538,6 +1533,25 @@ def __array_function__(self, func, types, args, kwargs): pass return NotImplemented + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the cudf DataFrame as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be cast, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. Currently not implemented. + + Returns + ------- + PyCapsule + """ + if requested_schema is not None: + raise NotImplementedError("requested_schema is not supported") + return self.to_arrow().__arrow_c_stream__() + # The _get_numeric_data method is necessary for dask compatibility. @_performance_tracking def _get_numeric_data(self): @@ -2249,6 +2263,7 @@ def to_dict( self, orient: str = "dict", into: type[dict] = dict, + index: bool = True, ) -> dict | list[dict]: """ Convert the DataFrame to a dictionary. @@ -2282,6 +2297,13 @@ instance of the mapping type you want. If you want a collections.defaultdict, you must pass it initialized. + index : bool, default True + Whether to include the index item (and index_names item if `orient` + is 'tight') in the returned dictionary. Can only be ``False`` + when `orient` is 'split' or 'tight'. Note that when `orient` is + 'records', this parameter does not take effect (the index item is + always excluded). + Returns ------- dict, list or collections.abc.Mapping @@ -2363,7 +2385,7 @@ def to_dict( raise TypeError(f"unsupported type: {into}") return cons(self.items()) # type: ignore[misc] - return self.to_pandas().to_dict(orient=orient, into=into) + return self.to_pandas().to_dict(orient=orient, into=into, index=index) @_performance_tracking def scatter_by_map( @@ -2750,7 +2772,7 @@ def reindex( Chrome 200 0.02 .. pandas-compat:: - **DataFrame.reindex** + :meth:`pandas.DataFrame.reindex` Note: One difference from Pandas is that ``NA`` is used for rows that do not match, rather than ``NaN``.
One side effect of this is @@ -2822,6 +2844,10 @@ def reindex( index=index, inplace=False, fill_value=fill_value, + level=level, + method=method, + limit=limit, + tolerance=tolerance, ) @_performance_tracking @@ -2939,7 +2965,7 @@ def set_index( # label-like if is_scalar(col) or isinstance(col, tuple): if col in self._column_names: - data_to_add.append(self[col]) + data_to_add.append(self[col]._column) names.append(col) if drop: to_drop.append(col) @@ -2954,7 +2980,7 @@ def set_index( elif isinstance( col, (cudf.Series, cudf.Index, pd.Series, pd.Index) ): - data_to_add.append(col) + data_to_add.append(as_column(col)) names.append(col.name) else: try: @@ -3018,7 +3044,12 @@ def fillna( ) @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False, axis=None, level=None): + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") + from cudf.core._internals.where import ( _check_and_cast_columns_with_other, _make_categorical_like, @@ -3160,7 +3191,14 @@ class speed type ) ) def reset_index( - self, level=None, drop=False, inplace=False, col_level=0, col_fill="" + self, + level=None, + drop=False, + inplace=False, + col_level=0, + col_fill="", + allow_duplicates: bool = False, + names: abc.Hashable | abc.Sequence[abc.Hashable] | None = None, ): return self._mimic_inplace( DataFrame._from_data( @@ -3169,32 +3207,45 @@ def reset_index( drop=drop, col_level=col_level, col_fill=col_fill, + allow_duplicates=allow_duplicates, + names=names, ) ), inplace=inplace, ) @_performance_tracking - def insert(self, loc, name, value, nan_as_null=no_default): + def insert( + self, + loc, + column, + value, + allow_duplicates: bool = False, + nan_as_null=no_default, + ): """Add a column to DataFrame at the index specified by loc. Parameters ---------- loc : int location to insert by index, cannot be greater than num columns + 1 - name : number or string - name or label of column to be inserted + column : number or string + Name or label of column to be inserted value : Series or array-like nan_as_null : bool, Default None If ``None``/``True``, converts ``np.nan`` values to ``null`` values. If ``False``, leaves ``np.nan`` values as is. """ + if allow_duplicates is not False: + raise NotImplementedError( + "allow_duplicates is currently not implemented." + ) if nan_as_null is no_default: nan_as_null = not cudf.get_option("mode.pandas_compatible") return self._insert( loc=loc, - name=name, + name=column, value=value, nan_as_null=nan_as_null, ignore_index=False, @@ -3350,7 +3401,7 @@ def diff(self, periods=1, axis=0): 5 2 5 20 .. pandas-compat:: - **DataFrame.diff** + :meth:`pandas.DataFrame.diff` Diff currently only supports numeric dtype columns. """ @@ -3555,7 +3606,7 @@ def rename( 30 3 6 ..
pandas-compat:: - **DataFrame.rename** + :meth:`pandas.DataFrame.rename` * Not Supporting: level @@ -3628,7 +3679,9 @@ def rename( return result @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) out.columns = [ @@ -3637,7 +3690,9 @@ def add_prefix(self, prefix): return out @_performance_tracking - def add_suffix(self, suffix): + def add_suffix(self, suffix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) out.columns = [ @@ -3670,7 +3725,7 @@ def agg(self, aggs, axis=None): ``DataFrame`` is returned. .. pandas-compat:: - **DataFrame.agg** + :meth:`pandas.DataFrame.agg` * Not supporting: ``axis``, ``*args``, ``**kwargs`` @@ -3843,7 +3898,7 @@ def nlargest(self, n, columns, keep="first"): Brunei 434000 12128 BN .. pandas-compat:: - **DataFrame.nlargest** + :meth:`pandas.DataFrame.nlargest` - Only a single column is supported in *columns* """ @@ -3915,7 +3970,7 @@ def nsmallest(self, n, columns, keep="first"): Nauru 337000 182 NR .. pandas-compat:: - **DataFrame.nsmallest** + :meth:`pandas.DataFrame.nsmallest` - Only a single column is supported in *columns* """ @@ -3997,7 +4052,7 @@ def transpose(self): a new (ncol x nrow) dataframe. self is (nrow x ncol) .. pandas-compat:: - **DataFrame.transpose, DataFrame.T** + :meth:`pandas.DataFrame.transpose`, :attr:`pandas.DataFrame.T` Not supporting *copy* because default and only behavior is copy=True @@ -4053,7 +4108,15 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) @_performance_tracking - def melt(self, **kwargs): + def melt( + self, + id_vars=None, + value_vars=None, + var_name=None, + value_name="value", + col_level=None, + ignore_index: bool = True, + ): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -4080,23 +4143,30 @@ def melt(self, **kwargs): """ from cudf.core.reshape import melt - return melt(self, **kwargs) + return melt( + self, + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name, + col_level=col_level, + ignore_index=ignore_index, + ) @_performance_tracking def merge( self, right, + how="inner", on=None, left_on=None, right_on=None, left_index=False, right_index=False, - how="inner", sort=False, - lsuffix=None, - rsuffix=None, - indicator=False, suffixes=("_x", "_y"), + indicator=False, + validate=None, ): """Merge GPU DataFrame objects by performing a database-style join operation by columns or indexes. @@ -4188,7 +4258,7 @@ def merge( from both sides. .. pandas-compat:: - **DataFrame.merge** + :meth:`pandas.DataFrame.merge` DataFrames merges in cuDF result in non-deterministic row ordering. @@ -4197,17 +4267,8 @@ def merge( raise NotImplementedError( "Only indicator=False is currently supported" ) - - if lsuffix or rsuffix: - raise ValueError( - "The lsuffix and rsuffix keywords have been replaced with the " - "``suffixes=`` keyword. 
" - "Please provide the following instead: \n\n" - " suffixes=('%s', '%s')" - % (lsuffix or "_x", rsuffix or "_y") - ) - else: - lsuffix, rsuffix = suffixes + if validate is not None: + raise NotImplementedError("validate is currently not supported.") lhs, rhs = self, right merge_cls = Merge @@ -4244,6 +4305,7 @@ def join( lsuffix="", rsuffix="", sort=False, + validate: str | None = None, ): """Join columns with other DataFrame on index or on a key column. @@ -4257,19 +4319,33 @@ def join( column names when avoiding conflicts. sort : bool Set to True to ensure sorted ordering. + validate : str, optional + If specified, checks if join is of specified type. + + * "one_to_one" or "1:1": check if join keys are unique in both left + and right datasets. + * "one_to_many" or "1:m": check if join keys are unique in left dataset. + * "many_to_one" or "m:1": check if join keys are unique in right dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + + Currently not supported. Returns ------- joined : DataFrame .. pandas-compat:: - **DataFrame.join** + :meth:`pandas.DataFrame.join` - *other* must be a single DataFrame for now. - *on* is not supported yet due to lack of multi-index support. """ if on is not None: raise NotImplementedError("The on parameter is not yet supported") + elif validate is not None: + raise NotImplementedError( + "The validate parameter is not yet supported" + ) df = self.merge( other, @@ -4306,7 +4382,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -4317,7 +4392,6 @@ def groupby( as_index, sort, group_keys, - squeeze, observed, dropna, ) @@ -4385,7 +4459,7 @@ def query(self, expr, local_dict=None): 1 2018-10-08 .. pandas-compat:: - **DataFrame.query** + :meth:`pandas.DataFrame.query` One difference from pandas is that ``query`` currently only supports numeric, datetime, timedelta, or bool dtypes. @@ -4420,7 +4494,16 @@ def query(self, expr, local_dict=None): @_performance_tracking def apply( - self, func, axis=1, raw=False, result_type=None, args=(), **kwargs + self, + func, + axis=1, + raw=False, + result_type=None, + args=(), + by_row: Literal[False, "compat"] = "compat", + engine: Literal["python", "numba"] = "python", + engine_kwargs: dict[str, bool] | None = None, + **kwargs, ): """ Apply a function along an axis of the DataFrame. @@ -4448,6 +4531,24 @@ def apply( Not yet supported args: tuple Positional arguments to pass to func in addition to the dataframe. + by_row : False or "compat", default "compat" + Only has an effect when ``func`` is a listlike or dictlike of funcs + and the func isn't a string. + If "compat", will if possible first translate the func into pandas + methods (e.g. ``Series().apply(np.sum)`` will be translated to + ``Series().sum()``). If that doesn't work, will try call to apply again with + ``by_row=True`` and if that fails, will call apply again with + ``by_row=False`` (backward compatible). + If False, the funcs will be passed the whole Series at once. + + Currently not supported. + engine : {'python', 'numba'}, default 'python' + Unused. Added for compatibility with pandas. + engine_kwargs : dict + Unused. Added for compatibility with pandas. + **kwargs + Additional keyword arguments to pass as keywords arguments to + `func`. 
Examples -------- @@ -4598,13 +4699,17 @@ def apply( """ if axis != 1: - raise ValueError( + raise NotImplementedError( "DataFrame.apply currently only supports row wise ops" ) if raw: - raise ValueError("The `raw` kwarg is not yet supported.") + raise NotImplementedError("The `raw` kwarg is not yet supported.") if result_type is not None: - raise ValueError("The `result_type` kwarg is not yet supported.") + raise NotImplementedError( + "The `result_type` kwarg is not yet supported." + ) + if by_row != "compat": + raise NotImplementedError("by_row is currently not supported.") return self._apply(func, _get_row_kernel, *args, **kwargs) @@ -4698,7 +4803,7 @@ def _func(x): # pragma: no cover result = {} for name, col in self._data.items(): apply_sr = Series._from_data({None: col}) - result[name] = apply_sr.apply(_func) + result[name] = apply_sr.apply(_func)._column return DataFrame._from_data(result, index=self.index) @@ -5447,10 +5552,11 @@ def from_arrow(cls, table): 2 3 6 .. pandas-compat:: - **DataFrame.from_arrow** + `pandas.DataFrame.from_arrow` - - Does not support automatically setting index column(s) similar - to how ``to_pandas`` works for PyArrow Tables. + This method does not exist in pandas but it is similar to + how :meth:`pyarrow.Table.to_pandas` works for PyArrow Tables, i.e. + it does not support automatically setting index column(s). """ index_col = None col_index_names = None @@ -5504,7 +5610,7 @@ def from_arrow(cls, table): return out @_performance_tracking - def to_arrow(self, preserve_index=None): + def to_arrow(self, preserve_index=None) -> pa.Table: """ Convert to a PyArrow Table. @@ -5594,18 +5700,36 @@ def to_arrow(self, preserve_index=None): return out.replace_schema_metadata(metadata) @_performance_tracking - def to_records(self, index=True): + def to_records(self, index=True, column_dtypes=None, index_dtypes=None): """Convert to a numpy recarray Parameters ---------- index : bool Whether to include the index in the output. + column_dtypes : str, type, dict, default None + If a string or type, the data type to store all columns. If + a dictionary, a mapping of column names and indices (zero-indexed) + to specific data types. Currently not supported. + index_dtypes : str, type, dict, default None + If a string or type, the data type to store all index levels. If + a dictionary, a mapping of index level names and indices + (zero-indexed) to specific data types. + This mapping is applied only if `index=True`. + Currently not supported. Returns ------- numpy recarray """ + if column_dtypes is not None: + raise NotImplementedError( + "column_dtypes is currently not supported." + ) + elif index_dtypes is not None: + raise NotImplementedError( + "index_dtypes is currently not supported." + ) members = [("index", self.index.dtype)] if index else [] members += [(col, self[col].dtype) for col in self._data.names] dtype = np.dtype(members) @@ -5618,7 +5742,16 @@ def to_records(self, index=True): @classmethod @_performance_tracking - def from_records(cls, data, index=None, columns=None, nan_as_null=False): + def from_records( + cls, + data, + index=None, + exclude=None, + columns=None, + coerce_float: bool = False, + nrows: int | None = None, + nan_as_null=False, + ): """ Convert structured or record ndarray to DataFrame. @@ -5628,13 +5761,32 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): index : str, array-like The name of the index column in *data*. If None, the default index is used.
+ exclude : sequence, default None + Columns or fields to exclude. + Currently not implemented. columns : list of str List of column names to include. + coerce_float : bool, default False + Attempt to convert values of non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets. + Currently not implemented. + nrows : int, default None + Number of rows to read if data is an iterator. + Currently not implemented. Returns ------- DataFrame """ + if exclude is not None: + raise NotImplementedError("exclude is currently not supported.") + if coerce_float is not False: + raise NotImplementedError( + "coerce_float is currently not supported." + ) + if nrows is not None: + raise NotImplementedError("nrows is currently not supported.") + if data.ndim != 1 and data.ndim != 2: raise ValueError( f"records dimension expected 1 or 2 but found {data.ndim}" ) @@ -5688,6 +5840,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): ), level_names=level_names, label_dtype=getattr(columns, "dtype", None), + verify=False, ), index=new_index, ) @@ -5774,6 +5927,7 @@ def _from_arrays( ), level_names=level_names, label_dtype=getattr(columns, "dtype", None), + verify=False, ), index=index, ) @@ -5815,9 +5969,9 @@ def quantile( axis=0, numeric_only=True, interpolation=None, + method="single", columns=None, exact=True, - method="single", ): """ Return values at the given quantile. @@ -5843,14 +5997,14 @@ def quantile( * higher: `j`. * nearest: `i` or `j` whichever is nearest. * midpoint: (`i` + `j`) / 2. - columns : list of str - List of column names to include. - exact : boolean - Whether to use approximate or exact quantile algorithm. method : {'single', 'table'}, default `'single'` Whether to compute quantiles per-column ('single') or over all columns ('table'). When 'table', the only allowed interpolation methods are 'nearest', 'lower', and 'higher'. + columns : list of str + List of column names to include. + exact : boolean + Whether to use approximate or exact quantile algorithm. Returns ------- @@ -5884,7 +6038,7 @@ def quantile( 0.5 2.5 55.0 .. pandas-compat:: - **DataFrame.quantile** + :meth:`pandas.DataFrame.quantile` One notable difference from Pandas is when DataFrame is of non-numeric types and result is expected to be a Series in case of @@ -6174,7 +6328,7 @@ def count(self, axis=0, numeric_only=False): dtype: int64 .. pandas-compat:: - **DataFrame.count** + :meth:`pandas.DataFrame.count` Parameters currently not supported are `axis` and `numeric_only`. """ @@ -6184,10 +6338,9 @@ def count(self, axis=0, numeric_only=False): length = len(self) return Series._from_data( { - None: [ - length - self._data[col].null_count - for col in self._data.names - ] + None: as_column( + [length - col.null_count for col in self._columns] + ) }, cudf.Index(self._data.names), ) @@ -6412,7 +6565,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): 1 2.0 .. pandas-compat:: - **DataFrame.mode** + :meth:`pandas.DataFrame.mode` ``axis`` parameter is currently not supported. """ @@ -7173,25 +7326,47 @@ def unnamed_group_generator(): return result @_performance_tracking - def cov(self, **kwargs): + def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False): """Compute the covariance matrix of a DataFrame. Parameters ---------- - **kwargs - Keyword arguments to be passed to cupy.cov + min_periods : int, optional + Minimum number of observations required per pair of columns to + have a valid result. + Currently not supported.
+ + ddof : int, default 1 + Delta degrees of freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + + numeric_only : bool, default False + Include only `float`, `int` or `boolean` data. + Currently not supported. Returns ------- cov : DataFrame """ - cov = cupy.cov(self.values, rowvar=False) + if min_periods is not None: + raise NotImplementedError( + "min_periods is currently not supported." + ) + + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) + + cov = cupy.cov(self.values, ddof=ddof, rowvar=False) cols = self._data.to_pandas_index() df = DataFrame(cupy.asfortranarray(cov)).set_index(cols) df._set_columns_like(self._data) return df - def corr(self, method="pearson", min_periods=None): + def corr( + self, method="pearson", min_periods=None, numeric_only: bool = False + ): """Compute the correlation matrix of a DataFrame. Parameters @@ -7221,6 +7396,11 @@ def corr(self, method="pearson", min_periods=None): if min_periods is not None: raise NotImplementedError("Unsupported argument 'min_periods'") + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) + corr = cupy.corrcoef(values, rowvar=False) cols = self._data.to_pandas_index() df = DataFrame(cupy.asfortranarray(corr)).set_index(cols) @@ -7256,7 +7436,9 @@ def to_struct(self, name=None): offset=0, ) return cudf.Series._from_data( - cudf.core.column_accessor.ColumnAccessor({name: col}), + cudf.core.column_accessor.ColumnAccessor( + {name: col}, verify=False + ), index=self.index, name=name, ) @@ -7359,9 +7541,9 @@ def pivot_table( @_performance_tracking @copy_docstring(reshape.unstack) - def unstack(self, level=-1, fill_value=None): + def unstack(self, level=-1, fill_value=None, sort: bool = True): return cudf.core.reshape.unstack( - self, level=level, fill_value=fill_value + self, level=level, fill_value=fill_value, sort=sort ) @_performance_tracking @@ -7407,7 +7589,12 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements @@ -7432,6 +7619,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `DataFrame.shift`. Returns ------- @@ -7477,7 +7667,7 @@ def pct_change( data = self.fillna(method=fill_method, limit=limit) return data.diff(periods=periods) / data.shift( - periods=periods, freq=freq + periods=periods, freq=freq, **kwargs ) def __dataframe__( @@ -7594,7 +7784,7 @@ def interleave_columns(self): The interleaved columns as a single column .. pandas-compat:: - **DataFrame.interleave_columns** + `pandas.DataFrame.interleave_columns` This method does not exist in pandas but it can be run as ``pd.Series(np.vstack(df.to_numpy()).reshape((-1,)))``. @@ -7696,7 +7886,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): 4 5 2 7 3 .. pandas-compat:: - **DataFrame.eval** + :meth:`pandas.DataFrame.eval` * Additional kwargs are not supported. * Bitwise and logical operators are not dtype-dependent. @@ -8305,7 +8495,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): )._column.unique() # Set the column dtype to the codes' dtype. 
The categories # will be re-assigned at the end - dtypes[idx] = min_scalar_type(len(categories[idx])) + dtypes[idx] = min_signed_type(len(categories[idx])) # Otherwise raise an error if columns have different dtypes elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index de715191c08..27afec18b4e 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -17,10 +17,15 @@ from pandas.core.arrays.arrow.extension_types import ArrowIntervalType import cudf -from cudf.core._compat import PANDAS_LT_300 +from cudf.core._compat import PANDAS_GE_210, PANDAS_LT_300 from cudf.core.abc import Serializable from cudf.utils.docutils import doc_apply +if PANDAS_GE_210: + PANDAS_NUMPY_DTYPE = pd.core.dtypes.dtypes.NumpyEADtype +else: + PANDAS_NUMPY_DTYPE = pd.core.dtypes.dtypes.PandasDtype + if TYPE_CHECKING: from cudf._typing import Dtype from cudf.core.buffer import Buffer @@ -72,7 +77,7 @@ def dtype(arbitrary): return np.dtype("object") else: return dtype(pd_dtype.numpy_dtype) - elif isinstance(pd_dtype, pd.core.dtypes.dtypes.NumpyEADtype): + elif isinstance(pd_dtype, PANDAS_NUMPY_DTYPE): return dtype(pd_dtype.numpy_dtype) elif isinstance(pd_dtype, pd.CategoricalDtype): return cudf.CategoricalDtype.from_pandas(pd_dtype) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 802751e47ad..32c313e42d3 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -389,7 +389,7 @@ def values_host(self) -> np.ndarray: return self.to_numpy() @_performance_tracking - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): raise TypeError( "Implicit conversion to a host NumPy array via __array__ is not " "allowed, To explicitly construct a GPU matrix, consider using " @@ -591,7 +591,7 @@ def where(self, cond, other=None, inplace: bool = False) -> Self | None: dtype: int64 .. pandas-compat:: - **DataFrame.where, Series.where** + :meth:`pandas.DataFrame.where`, :meth:`pandas.Series.where` Note that ``where`` treats missing values as falsy, in parallel with pandas treatment of nullable data: @@ -1187,6 +1187,7 @@ def searchsorted( self, values, side: Literal["left", "right"] = "left", + sorter=None, ascending: bool = True, na_position: Literal["first", "last"] = "last", ) -> ScalarLike | cupy.ndarray: @@ -1199,6 +1200,10 @@ def searchsorted( side : str {'left', 'right'} optional, default 'left' If 'left', the index of the first suitable location found is given If 'right', return the last such index + sorter : 1-D array-like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + Currently not supported. 
ascending : bool optional, default True Sorted Frame is in ascending order (otherwise descending) na_position : str {'last', 'first'} optional, default 'last' @@ -1245,10 +1250,12 @@ def searchsorted( >>> df.searchsorted(values_df, ascending=False) array([4, 4, 4, 0], dtype=int32) """ - # Call libcudf search_sorted primitive + # Note: pandas.DataFrame does not support searchsorted if na_position not in {"first", "last"}: raise ValueError(f"invalid na_position: {na_position}") + elif sorter is not None: + raise NotImplementedError("sorter is currently not supported.") scalar_flag = None if is_scalar(values): @@ -1305,7 +1312,7 @@ def argsort( order=None, ascending=True, na_position="last", - ): + ) -> cupy.ndarray: """Return the integer indices that would sort the Series values. Parameters @@ -1587,6 +1594,12 @@ def __pos__(self): def __abs__(self): return self._unaryop("abs") + def __bool__(self): + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. Use " + "a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + # Reductions @classmethod @_performance_tracking @@ -1641,7 +1654,7 @@ def min( 1 .. pandas-compat:: - **DataFrame.min, Series.min** + :meth:`pandas.DataFrame.min`, :meth:`pandas.Series.min` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1689,7 +1702,7 @@ def max( dtype: int64 .. pandas-compat:: - **DataFrame.max, Series.max** + :meth:`pandas.DataFrame.max`, :meth:`pandas.Series.max` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1742,7 +1755,7 @@ def all(self, axis=0, skipna=True, **kwargs): dtype: bool .. pandas-compat:: - **DataFrame.all, Series.all** + :meth:`pandas.DataFrame.all`, :meth:`pandas.Series.all` Parameters currently not supported are `axis`, `bool_only`, `level`. @@ -1795,7 +1808,7 @@ def any(self, axis=0, skipna=True, **kwargs): dtype: bool .. pandas-compat:: - **DataFrame.any, Series.any** + :meth:`pandas.DataFrame.any`, :meth:`pandas.Series.any` Parameters currently not supported are `axis`, `bool_only`, `level`. diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index d2c75715be2..3cfbd1d736a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -8,7 +8,7 @@ import warnings from collections import abc from functools import cached_property -from typing import TYPE_CHECKING, Any, Iterable +from typing import TYPE_CHECKING, Any, Iterable, Literal import cupy as cp import numpy as np @@ -306,6 +306,18 @@ def __iter__(self): grouped_values[offsets[i] : offsets[i + 1]], ) + def __len__(self) -> int: + return self.ngroups + + @property + def ngroups(self) -> int: + _, offsets, _, _ = self._grouped() + return len(offsets) - 1 + + @property + def ndim(self) -> int: + return self.obj.ndim + @property def dtypes(self): """ @@ -457,10 +469,20 @@ def size(self): ) @_performance_tracking - def cumcount(self): + def cumcount(self, ascending: bool = True): """ Return the cumulative count of keys in each group. + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from length of group - 1 to 0. + Currently not supported """ + if ascending is not True: + raise NotImplementedError( + "ascending is currently not implemented." 
+ ) return ( cudf.Series( cudf.core.column.column_empty( @@ -527,7 +549,7 @@ def _groupby(self): ) @_performance_tracking - def agg(self, func): + def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): """ Apply aggregation(s) to the groups. @@ -615,6 +637,22 @@ def agg(self, func): 1 1.5 1.75 2.0 2.0 2 3.0 3.00 1.0 1.0 """ + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if args: + raise NotImplementedError( + "Passing args to func is currently not supported." + ) + if kwargs: + raise NotImplementedError( + "Passing kwargs to func is currently not supported." + ) column_names, columns, normalized_aggs = self._normalize_aggs(func) orig_dtypes = tuple(c.dtype for c in columns) @@ -744,7 +782,8 @@ def _reduce( Computed {op} of values within each group. .. pandas-compat:: - **{cls}.{op}** + :meth:`pandas.core.groupby.DataFrameGroupBy.{op}`, + :meth:`pandas.core.groupby.SeriesGroupBy.{op}` The numeric_only, min_count """ @@ -934,12 +973,13 @@ def tail(self, n: int = 5, *, preserve_order: bool = True): ) @_performance_tracking - def nth(self, n): + def nth(self, n, dropna: Literal["any", "all", None] = None): """ Return the nth row from each group. """ - - self.obj["__groupbynth_order__"] = range(0, len(self.obj)) + if dropna is not None: + raise NotImplementedError("dropna is not currently supported.") + self.obj["__groupbynth_order__"] = range(0, len(self.obj)) # type: ignore[index] # We perform another groupby here to have the grouping columns # be a part of dataframe columns. result = self.obj.groupby(self.grouping.keys).agg(lambda x: x.nth(n)) @@ -1359,7 +1399,9 @@ def _post_process_chunk_results( if isinstance(chunk_results, ColumnBase) or cudf.api.types.is_scalar( chunk_results[0] ): - data = {None: chunk_results} + data = ColumnAccessor( + {None: as_column(chunk_results)}, verify=False + ) ty = cudf.Series if self._as_index else cudf.DataFrame result = ty._from_data(data, index=group_names) result.index.names = self.grouping.names @@ -1420,13 +1462,13 @@ def _post_process_chunk_results( @_performance_tracking def apply( - self, function, *args, engine="auto", include_groups: bool = True + self, func, *args, engine="auto", include_groups: bool = True, **kwargs ): """Apply a python transformation function over the grouped chunk. Parameters ---------- - function : callable + func : callable The python transformation function that will be applied on the grouped chunk. args : tuple @@ -1449,6 +1491,9 @@ def apply( When True, will attempt to apply ``func`` to the groupings in the case that they are columns of the DataFrame. In the future, this will default to ``False``. + kwargs : dict + Optional keyword arguments to pass to the function. + Currently not supported Examples -------- @@ -1482,7 +1527,8 @@ def mult(df): 6 2 6 12 .. pandas-compat:: - **GroupBy.apply** + :meth:`pandas.core.groupby.DataFrameGroupBy.apply`, + :meth:`pandas.core.groupby.SeriesGroupBy.apply` cuDF's ``groupby.apply`` is limited compared to pandas. In some situations, Pandas returns the grouped keys as part of @@ -1524,13 +1570,17 @@ def mult(df): dtype: int64 """ + if kwargs: + raise NotImplementedError( + "Passing kwargs to func is currently not supported." 
+ ) if self.obj.empty: - if function in {"count", "size", "idxmin", "idxmax"}: + if func in {"count", "size", "idxmin", "idxmax"}: res = cudf.Series([], dtype="int64") else: res = self.obj.copy(deep=True) res.index = self.grouping.keys - if function in {"sum", "product"}: + if func in {"sum", "product"}: # For `sum` & `product`, boolean types # will need to result in `int64` type. for name, col in res._data.items(): @@ -1538,20 +1588,20 @@ def mult(df): res._data[name] = col.astype("int") return res - if not callable(function): - raise TypeError(f"type {type(function)} is not callable") + if not callable(func): + raise TypeError(f"type {type(func)} is not callable") group_names, offsets, group_keys, grouped_values = self._grouped( include_groups=include_groups ) if engine == "auto": - if _can_be_jitted(grouped_values, function, args): + if _can_be_jitted(grouped_values, func, args): engine = "jit" else: engine = "cudf" if engine == "jit": result = self._jit_groupby_apply( - function, + func, group_names, offsets, group_keys, @@ -1560,7 +1610,7 @@ def mult(df): ) elif engine == "cudf": result = self._iterative_groupby_apply( - function, + func, group_names, offsets, group_keys, @@ -1740,12 +1790,14 @@ def _broadcast(self, values: cudf.Series) -> cudf.Series: return values @_performance_tracking - def transform(self, function): + def transform( + self, func, *args, engine=None, engine_kwargs=None, **kwargs + ): """Apply an aggregation, then broadcast the result to the group size. Parameters ---------- - function: str or callable + func: str or callable Aggregation to apply to each group. Note that the set of operations currently supported by `transform` is identical to that supported by the `agg` method. @@ -1774,18 +1826,35 @@ def transform(self, function): -------- agg """ - if not (isinstance(function, str) or callable(function)): + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if args: + raise NotImplementedError( + "Passing args to func is currently not supported." + ) + if kwargs: + raise NotImplementedError( + "Passing kwargs to func is currently not supported." + ) + + if not (isinstance(func, str) or callable(func)): raise TypeError( "Aggregation must be a named aggregation or a callable" ) try: - result = self.agg(function) + result = self.agg(func) except TypeError as e: raise NotImplementedError( "Currently, `transform()` supports only aggregations." ) from e # If the aggregation is a scan, don't broadcast - if libgroupby._is_all_scan_aggregate([[function]]): + if libgroupby._is_all_scan_aggregate([[func]]): if len(result) != len(self.obj): raise AssertionError( "Unexpected result length for scan transform" @@ -1820,7 +1889,7 @@ def func(x): return self.agg(func) @_performance_tracking - def describe(self, include=None, exclude=None): + def describe(self, percentiles=None, include=None, exclude=None): """ Generate descriptive statistics that summarizes the central tendency, dispersion and shape of a dataset's distribution, excluding NaN values. @@ -1829,6 +1898,10 @@ def describe(self, include=None, exclude=None): Parameters ---------- + percentiles : list-like of numbers, optional + The percentiles to include in the output. + Currently not supported. + include: 'all', list-like of dtypes or None (default), optional list of data types to include in the result. 
Ignored for Series. @@ -1865,8 +1938,12 @@ def describe(self, include=None, exclude=None): 90 1 24.0 24.0 24.0 24.0 24.0 24.0 """ - if exclude is not None and include is not None: - raise NotImplementedError + if percentiles is not None: + raise NotImplementedError("percentiles is currently not supported") + if exclude is not None: + raise NotImplementedError("exclude is currently not supported") + if include is not None: + raise NotImplementedError("include is currently not supported") res = self.agg( [ @@ -1892,69 +1969,7 @@ def describe(self, include=None, exclude=None): return res @_performance_tracking - def corr(self, method="pearson", min_periods=1): - """ - Compute pairwise correlation of columns, excluding NA/null values. - - Parameters - ---------- - method: {"pearson", "kendall", "spearman"} or callable, - default "pearson". Currently only the pearson correlation - coefficient is supported. - - min_periods: int, optional - Minimum number of observations required per pair of columns - to have a valid result. - - Returns - ------- - DataFrame - Correlation matrix. - - Examples - -------- - >>> import cudf - >>> gdf = cudf.DataFrame({ - ... "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - ... "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]}) - >>> gdf - id val1 val2 val3 - 0 a 5 4 4 - 1 a 4 5 5 - 2 a 6 6 6 - 3 b 4 1 1 - 4 b 8 2 2 - 5 b 7 9 9 - 6 c 4 8 8 - 7 c 5 5 5 - 8 c 2 1 1 - >>> gdf.groupby("id").corr(method="pearson") - val1 val2 val3 - id - a val1 1.000000 0.500000 0.500000 - val2 0.500000 1.000000 1.000000 - val3 0.500000 1.000000 1.000000 - b val1 1.000000 0.385727 0.385727 - val2 0.385727 1.000000 1.000000 - val3 0.385727 1.000000 1.000000 - c val1 1.000000 0.714575 0.714575 - val2 0.714575 1.000000 1.000000 - val3 0.714575 1.000000 1.000000 - """ - - if method.lower() not in ("pearson",): - raise NotImplementedError( - "Only pearson correlation is currently supported" - ) - - return self._cov_or_corr( - lambda x: x.corr(method, min_periods), "Correlation" - ) - - @_performance_tracking - def cov(self, min_periods=0, ddof=1): + def cov(self, min_periods=0, ddof=1, numeric_only: bool = False): """ Compute the pairwise covariance among the columns of a DataFrame, excluding NA/null values. @@ -2038,6 +2053,10 @@ def cov(self, min_periods=0, ddof=1): val2 3.833333 12.333333 12.333333 val3 3.833333 12.333333 12.333333 """ + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) return self._cov_or_corr( lambda x: x.cov(min_periods, ddof), "Covariance" @@ -2133,7 +2152,13 @@ def _cov_or_corr(self, func, method_name): return res @_performance_tracking - def var(self, ddof=1): + def var( + self, + ddof=1, + engine=None, + engine_kwargs=None, + numeric_only: bool = False, + ): """Compute the column-wise variance of the values in each group. Parameters @@ -2142,6 +2167,18 @@ def var(self, ddof=1): The delta degrees of freedom. N - ddof is the divisor used to normalize the variance. """ + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas" + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional added for compatibility with pandas" + ) + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." 
+ )
 def func(x):
 return getattr(x, "var")(ddof=ddof)
@@ -2149,7 +2186,13 @@ def func(x):
 return self.agg(func)
 @_performance_tracking
- def std(self, ddof=1):
+ def std(
+ self,
+ ddof=1,
+ engine=None,
+ engine_kwargs=None,
+ numeric_only: bool = False,
+ ):
 """Compute the column-wise std of the values in each group.
 Parameters
@@ -2158,6 +2201,18 @@ def std(self, ddof=1):
 The delta degrees of freedom. N - ddof is the divisor used to
 normalize the standard deviation.
 """
+ if engine is not None:
+ raise NotImplementedError(
+ "engine is non-functional and added for compatibility with pandas"
+ )
+ if engine_kwargs is not None:
+ raise NotImplementedError(
+ "engine_kwargs is non-functional and added for compatibility with pandas"
+ )
+ if numeric_only is not False:
+ raise NotImplementedError(
+ "numeric_only is currently not supported."
+ )
 def func(x):
 return getattr(x, "std")(ddof=ddof)
@@ -2165,7 +2220,9 @@ def func(x):
 return self.agg(func)
 @_performance_tracking
- def quantile(self, q=0.5, interpolation="linear"):
+ def quantile(
+ self, q=0.5, interpolation="linear", numeric_only: bool = False
+ ):
 """Compute the column-wise quantiles of the values in each group.
 Parameters
@@ -2175,7 +2232,14 @@
 interpolation : {"linear", "lower", "higher", "midpoint", "nearest"}
 The interpolation method to use when the desired quantile lies between
 two data points. Defaults to "linear".
+ numeric_only : bool, default False
+ Include only `float`, `int` or `boolean` data.
+ Currently not supported.
 """
+ if numeric_only is not False:
+ raise NotImplementedError(
+ "numeric_only is currently not supported."
+ )
 def func(x):
 return getattr(x, "quantile")(q=q, interpolation=interpolation)
@@ -2329,7 +2393,14 @@ def fillna(
 )
 @_performance_tracking
- def shift(self, periods=1, freq=None, axis=0, fill_value=None):
+ def shift(
+ self,
+ periods=1,
+ freq=None,
+ axis=0,
+ fill_value=None,
+ suffix: str | None = None,
+ ):
 """
 Shift each group by ``periods`` positions.
@@ -2351,6 +2422,10 @@
 the list. The length of the list should match the number of columns
 shifted. Each value should match the data type of the column to fill.
+ suffix : str, optional
+ A string to add to each shifted column if there are multiple periods.
+ Ignored otherwise.
+ Currently not supported.
 Returns
 -------
@@ -2358,7 +2433,8 @@
 Object shifted within each group.
 .. pandas-compat::
- **GroupBy.shift**
+ :meth:`pandas.core.groupby.DataFrameGroupBy.shift`,
+ :meth:`pandas.core.groupby.SeriesGroupBy.shift`
 Parameter ``freq`` is unsupported.
 """
@@ -2369,6 +2445,9 @@
 if not axis == 0:
 raise NotImplementedError("Only axis=0 is supported.")
+ if suffix is not None:
+ raise NotImplementedError("suffix is not currently supported.")
+
 values = self.grouping.values
 if is_list_like(fill_value):
 if len(fill_value) != len(values._data):
@@ -2468,6 +2547,142 @@ def pct_change(
 shifted = fill_grp.shift(periods=periods, freq=freq)
 return (filled / shifted) - 1
+ def _mimic_pandas_order(
+ self, result: DataFrameOrSeries
+ ) -> DataFrameOrSeries:
+ """Given a groupby result from libcudf, reconstruct the row orders
+ matching that of pandas. This also adds appropriate indices.
+ """
+ # TODO: copy metadata after this method is a common pattern, should
+ # merge in this method.
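# --- Illustrative usage, not part of the patch --------------------------
# Sketch of the extended groupby ``shift`` above: rows shift within each
# group, ``fill_value`` fills the vacated slots, and the new pandas-compat
# ``suffix`` parameter only raises for now. Data are made up.
import cudf

df = cudf.DataFrame({"key": ["a", "a", "b", "b"], "val": [1, 2, 3, 4]})
gb = df.groupby("key")

# The first row of each group receives fill_value:
print(gb.shift(periods=1, fill_value=0))

try:
    gb.shift(periods=1, suffix="_lag")
except NotImplementedError as err:
    print(err)  # suffix is guarded above
# -------------------------------------------------------------------------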
+
+ # This function is used to reorder the results of scan-based
+ # groupbys which have the same output size as input size.
+ # However, if the grouping key has NAs and dropna=True, the
+ # result coming back from libcudf has null_count fewer rows than
+ # the input, so we must produce an ordering from the full
+ # input range.
+ _, _, (ordering,) = self._groupby.groups(
+ [as_column(range(0, len(self.obj)))]
+ )
+ if self._dropna and any(
+ c.has_nulls(include_nan=True) > 0
+ for c in self.grouping._key_columns
+ ):
+ # Scan aggregations with null/nan keys put nulls in the
+ # corresponding output rows in pandas; to do that here,
+ # expand the result by reindexing.
+ ri = cudf.RangeIndex(0, len(self.obj))
+ result.index = cudf.Index(ordering)
+ # This reorders and expands
+ result = result.reindex(ri)
+ else:
+ # Just reorder according to the groupings
+ result = result.take(ordering.argsort())
+ # Now produce the actual index we first thought of
+ result.index = self.obj.index
+ return result
+
+ def ohlc(self):
+ """
+ Compute open, high, low and close values of a group, excluding missing values.
+
+ Currently not implemented.
+ """
+ raise NotImplementedError("ohlc is currently not implemented")
+
+ @property
+ def plot(self):
+ """
+ Make plots of a grouped Series or DataFrame.
+
+ Currently not implemented.
+ """
+ raise NotImplementedError("plot is currently not implemented")
+
+ def resample(self, rule, *args, include_groups: bool = True, **kwargs):
+ """
+ Provide resampling when using a TimeGrouper.
+
+ Currently not implemented.
+ """
+ raise NotImplementedError("resample is currently not implemented")
+
+ def take(self, indices):
+ """
+ Return the elements in the given *positional* indices in each group.
+
+ Currently not implemented.
+ """
+ raise NotImplementedError("take is currently not implemented")
+
+ def filter(self, func, dropna: bool = True, *args, **kwargs):
+ """
+ Filter elements from groups that don't satisfy a criterion.
+
+ Currently not implemented.
+ """
+ raise NotImplementedError("filter is currently not implemented")
+
+ def expanding(self, *args, **kwargs):
+ """
+ Return an expanding grouper, providing expanding
+ functionality per group.
+
+ Currently not implemented.
+ """
+ raise NotImplementedError("expanding is currently not implemented")
+
+ def ewm(self, *args, **kwargs):
+ """
+ Return an ewm grouper, providing ewm functionality per group.
+
+ Currently not implemented.
+ """
+ raise NotImplementedError("ewm is currently not implemented")
+
+ def any(self, skipna: bool = True):
+ """
+ Return True if any value in the group is truthful, else False.
+
+ Currently not implemented.
+ """
+ raise NotImplementedError("any is currently not implemented")
+
+ def all(self, skipna: bool = True):
+ """
+ Return True if all values in the group are truthful, else False.
+
+ Currently not implemented.
+ """ + raise NotImplementedError("all is currently not implemented") + + +class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): + obj: "cudf.core.dataframe.DataFrame" + + _PROTECTED_KEYS = frozenset(("obj",)) + + def _reduce_numeric_only(self, op: str): + columns = list( + name + for name in self.obj._data.names + if ( + is_numeric_dtype(self.obj._data[name].dtype) + and name not in self.grouping.names + ) + ) + return self[columns].agg(op) + + def __getitem__(self, key): + return self.obj[key].groupby( + by=self.grouping.keys, + dropna=self._dropna, + sort=self._sort, + group_keys=self._group_keys, + as_index=self._as_index, + ) + def value_counts( self, subset=None, @@ -2632,68 +2847,112 @@ def value_counts( return result - def _mimic_pandas_order( - self, result: DataFrameOrSeries - ) -> DataFrameOrSeries: - """Given a groupby result from libcudf, reconstruct the row orders - matching that of pandas. This also adds appropriate indices. + @_performance_tracking + def corr( + self, method="pearson", min_periods=1, numeric_only: bool = False + ): """ - # TODO: copy metadata after this method is a common pattern, should - # merge in this method. + Compute pairwise correlation of columns, excluding NA/null values. - # This function is used to reorder the results of scan-based - # groupbys which have the same output size as input size. - # However, if the grouping key has NAs and dropna=True, the - # result coming back from libcudf has null_count few rows than - # the input, so we must produce an ordering from the full - # input range. - _, _, (ordering,) = self._groupby.groups( - [as_column(range(0, len(self.obj)))] - ) - if self._dropna and any( - c.has_nulls(include_nan=True) > 0 - for c in self.grouping._key_columns - ): - # Scan aggregations with null/nan keys put nulls in the - # corresponding output rows in pandas, to do that here - # expand the result by reindexing. - ri = cudf.RangeIndex(0, len(self.obj)) - result.index = cudf.Index(ordering) - # This reorders and expands - result = result.reindex(ri) - else: - # Just reorder according to the groupings - result = result.take(ordering.argsort()) - # Now produce the actual index we first thought of - result.index = self.obj.index - return result + Parameters + ---------- + method: {"pearson", "kendall", "spearman"} or callable, + default "pearson". Currently only the pearson correlation + coefficient is supported. + min_periods: int, optional + Minimum number of observations required per pair of columns + to have a valid result. -class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): - obj: "cudf.core.dataframe.DataFrame" + Returns + ------- + DataFrame + Correlation matrix. - _PROTECTED_KEYS = frozenset(("obj",)) + Examples + -------- + >>> import cudf + >>> gdf = cudf.DataFrame({ + ... "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + ... 
"val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]}) + >>> gdf + id val1 val2 val3 + 0 a 5 4 4 + 1 a 4 5 5 + 2 a 6 6 6 + 3 b 4 1 1 + 4 b 8 2 2 + 5 b 7 9 9 + 6 c 4 8 8 + 7 c 5 5 5 + 8 c 2 1 1 + >>> gdf.groupby("id").corr(method="pearson") + val1 val2 val3 + id + a val1 1.000000 0.500000 0.500000 + val2 0.500000 1.000000 1.000000 + val3 0.500000 1.000000 1.000000 + b val1 1.000000 0.385727 0.385727 + val2 0.385727 1.000000 1.000000 + val3 0.385727 1.000000 1.000000 + c val1 1.000000 0.714575 0.714575 + val2 0.714575 1.000000 1.000000 + val3 0.714575 1.000000 1.000000 + """ - def _reduce_numeric_only(self, op: str): - columns = list( - name - for name in self.obj._data.names - if ( - is_numeric_dtype(self.obj._data[name].dtype) - and name not in self.grouping.names + if method != "pearson": + raise NotImplementedError( + "Only pearson correlation is currently supported" + ) + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." ) - ) - return self[columns].agg(op) - def __getitem__(self, key): - return self.obj[key].groupby( - by=self.grouping.keys, - dropna=self._dropna, - sort=self._sort, - group_keys=self._group_keys, - as_index=self._as_index, + return self._cov_or_corr( + lambda x: x.corr(method, min_periods), "Correlation" ) + def hist( + self, + column=None, + by=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + ax=None, + sharex: bool = False, + sharey: bool = False, + figsize: tuple[float, float] | None = None, + layout: tuple[int, int] | None = None, + bins: int | abc.Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, + ): + raise NotImplementedError("hist is not currently implemented") + + def boxplot( + self, + subplots: bool = True, + column=None, + fontsize: int | None = None, + rot: int = 0, + grid: bool = True, + ax=None, + figsize: tuple[float, float] | None = None, + layout=None, + sharex: bool = False, + sharey: bool = True, + backend=None, + **kwargs, + ): + raise NotImplementedError("boxplot is not currently implemented") + DataFrameGroupBy.__doc__ = groupby_doc_template.format(ret="") @@ -2701,8 +2960,10 @@ def __getitem__(self, key): class SeriesGroupBy(GroupBy): obj: "cudf.core.series.Series" - def agg(self, func): - result = super().agg(func) + def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + result = super().agg( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) # downcast the result to a Series: if len(result._data): @@ -2717,14 +2978,95 @@ def agg(self, func): aggregate = agg - def apply(self, func, *args): - result = super().apply(func, *args) + def apply(self, func, *args, **kwargs): + result = super().apply(func, *args, **kwargs) # apply Series name to result result.name = self.obj.name return result + @property + def dtype(self) -> pd.Series: + raise NotImplementedError("dtype is currently not implemented.") + + def hist( + self, + by=None, + ax=None, + grid: bool = True, + xlabelsize: int | None = None, + xrot: float | None = None, + ylabelsize: int | None = None, + yrot: float | None = None, + figsize: tuple[float, float] | None = None, + bins: int | abc.Sequence[int] = 10, + backend: str | None = None, + legend: bool = False, + **kwargs, + ): + raise NotImplementedError("hist is currently not implemented.") + + @property + def is_monotonic_increasing(self) -> cudf.Series: + """ + Return whether each group's values are monotonically 
increasing. + + Currently not implemented + """ + raise NotImplementedError( + "is_monotonic_increasing is currently not implemented." + ) + + @property + def is_monotonic_decreasing(self) -> cudf.Series: + """ + Return whether each group's values are monotonically decreasing. + + Currently not implemented + """ + raise NotImplementedError( + "is_monotonic_decreasing is currently not implemented." + ) + + def nlargest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> cudf.Series: + """ + Return the largest n elements. + + Currently not implemented + """ + raise NotImplementedError("nlargest is currently not implemented.") + + def nsmallest( + self, n: int = 5, keep: Literal["first", "last", "all"] = "first" + ) -> cudf.Series: + """ + Return the smallest n elements. + + Currently not implemented + """ + raise NotImplementedError("nsmallest is currently not implemented.") + + def value_counts( + self, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + bins=None, + dropna: bool = True, + ) -> cudf.Series | cudf.DataFrame: + raise NotImplementedError("value_counts is currently not implemented.") + + def corr( + self, + other: cudf.Series, + method: str = "pearson", + min_periods: int | None = None, + ) -> cudf.Series: + raise NotImplementedError("corr is currently not implemented.") + SeriesGroupBy.__doc__ = groupby_doc_template.format(ret="") diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 4164f981fca..8c3b091abec 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -52,11 +52,9 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - _NUMPY_SCTYPES, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, - numeric_normalize_types, ) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf, search_range @@ -80,6 +78,11 @@ class IndexMeta(type): """Custom metaclass for Index that overrides instance/subclass tests.""" def __call__(cls, data, *args, **kwargs): + if kwargs.get("tupleize_cols", True) is not True: + raise NotImplementedError( + "tupleize_cols is currently not supported." 
+ ) + if cls is Index: return as_index( arbitrary=data, @@ -351,18 +354,16 @@ def hasnans(self) -> bool: @_performance_tracking def _data(self): return cudf.core.column_accessor.ColumnAccessor( - {self.name: self._values} + {self.name: self._values}, verify=False ) @_performance_tracking def __contains__(self, item): hash(item) - if isinstance(item, bool) or not isinstance( - item, - tuple( - _NUMPY_SCTYPES["int"] + _NUMPY_SCTYPES["float"] + [int, float] - ), - ): + if not isinstance(item, (np.floating, np.integer, int, float)): + return False + elif isinstance(item, (np.timedelta64, np.datetime64, bool)): + # Cases that would pass the above check return False try: int_item = int(item) @@ -539,8 +540,12 @@ def memory_usage(self, deep: bool = False) -> int: ) return 0 - def unique(self) -> Self: + def unique(self, level: int | None = None) -> Self: # RangeIndex always has unique values + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) return self.copy() @_performance_tracking @@ -963,7 +968,11 @@ def _indices_of(self, value) -> cudf.core.column.NumericalColumn: i = [] return as_column(i, dtype=size_type_dtype) - def isin(self, values): + def isin(self, values, level=None): + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) if is_scalar(values): raise TypeError( "only list-like objects are allowed to be passed " @@ -1001,21 +1010,23 @@ def __dask_tokenize__(self): class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta): """ - An array of orderable values that represent the indices of another Column + Immutable sequence used for indexing and alignment. - Attributes - ---------- - _values: A Column object - name: A string + The basic object storing axis labels for all pandas objects. Parameters ---------- - data : Column - The Column of data for this index - name : str optional - The name of the Index. If not provided, the Index adopts the value - Column's name. Otherwise if this name is different from the value - Column's, the data Column will be cloned to adopt this name. + data : array-like (1-dimensional) + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Index. If not specified, this will be + inferred from `data`. + copy : bool, default False + Copy input data. + name : object + Name to be stored in the index. + tupleize_cols : bool (default: True) + When True, attempt to create a MultiIndex if possible. + Currently not supported. """ @_performance_tracking @@ -1460,18 +1471,19 @@ def notna(self): notnull = notna def _is_numeric(self): - return isinstance( - self._values, cudf.core.column.NumericalColumn - ) and self.dtype != cudf.dtype("bool") + return ( + isinstance(self._values, cudf.core.column.NumericalColumn) + and self.dtype.kind != "b" + ) def _is_boolean(self): - return self.dtype == cudf.dtype("bool") + return self.dtype.kind == "b" def _is_integer(self): - return cudf.api.types.is_integer_dtype(self.dtype) + return self.dtype.kind in "iu" def _is_floating(self): - return cudf.api.types.is_float_dtype(self.dtype) + return self.dtype.kind == "f" def _is_object(self): return isinstance(self._values, cudf.core.column.StringColumn) @@ -1495,7 +1507,7 @@ def argsort( order=None, ascending=True, na_position="last", - ): + ) -> cupy.ndarray: """Return the integer indices that would sort the index. Parameters @@ -1601,19 +1613,31 @@ def append(self, other): f"either one of them to same dtypes." 
) - if isinstance(self._values, cudf.core.column.NumericalColumn): - if self.dtype != other.dtype: - this, other = numeric_normalize_types(self, other) + if ( + isinstance(self._column, cudf.core.column.NumericalColumn) + and self.dtype != other.dtype + ): + common_type = find_common_type((self.dtype, other.dtype)) + this = this.astype(common_type) + other = other.astype(common_type) to_concat = [this, other] return self._concat(to_concat) - def unique(self): + def unique(self, level: int | None = None) -> Self: + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) return cudf.core.index._index_from_data( {self.name: self._values.unique()}, name=self.name ) - def isin(self, values): + def isin(self, values, level=None): + if level is not None and level > 0: + raise IndexError( + f"Too many levels: Index has only 1 level, not {level + 1}" + ) if is_scalar(values): raise TypeError( "only list-like objects are allowed to be passed " @@ -1734,8 +1758,18 @@ def __init__( if tz is not None: raise NotImplementedError("tz is not yet supported") if normalize is not False: + warnings.warn( + "The 'normalize' keyword is " + "deprecated and will be removed in a future version. ", + FutureWarning, + ) raise NotImplementedError("normalize == True is not yet supported") if closed is not None: + warnings.warn( + "The 'closed' keyword is " + "deprecated and will be removed in a future version. ", + FutureWarning, + ) raise NotImplementedError("closed is not yet supported") if ambiguous != "raise": raise NotImplementedError("ambiguous is not yet supported") @@ -2479,6 +2513,14 @@ def __init__( if freq is not None: raise NotImplementedError("freq is not yet supported") + if closed is not None: + warnings.warn( + "The 'closed' keyword is " + "deprecated and will be removed in a future version. ", + FutureWarning, + ) + raise NotImplementedError("closed is not yet supported") + if unit is not None: warnings.warn( "The 'unit' keyword is " @@ -2679,6 +2721,10 @@ def __init__( data = data.as_ordered(ordered=False) super().__init__(data, name=name) + @property + def ordered(self) -> bool: + return self._column.ordered + @property # type: ignore @_performance_tracking def codes(self): @@ -2701,6 +2747,118 @@ def _is_boolean(self): def _is_categorical(self): return True + def add_categories(self, new_categories) -> Self: + """ + Add new categories. + + `new_categories` will be included at the last/highest place in the + categories and will be unused directly after this call. + """ + return type(self)._from_data( + {self.name: self._column.add_categories(new_categories)} + ) + + def as_ordered(self) -> Self: + """ + Set the Categorical to be ordered. + """ + return type(self)._from_data( + {self.name: self._column.as_ordered(ordered=True)} + ) + + def as_unordered(self) -> Self: + """ + Set the Categorical to be unordered. + """ + return type(self)._from_data( + {self.name: self._column.as_ordered(ordered=False)} + ) + + def remove_categories(self, removals) -> Self: + """ + Remove the specified categories. + + `removals` must be included in the old categories. + + Parameters + ---------- + removals : category or list of categories + The categories which should be removed. + """ + return type(self)._from_data( + {self.name: self._column.remove_categories(removals)} + ) + + def remove_unused_categories(self) -> Self: + """ + Remove categories which are not used. + + This method is currently not supported. 
+ """ + return type(self)._from_data( + {self.name: self._column.remove_unused_categories()} + ) + + def rename_categories(self, new_categories) -> Self: + """ + Rename categories. + + This method is currently not supported. + """ + return type(self)._from_data( + {self.name: self._column.rename_categories(new_categories)} + ) + + def reorder_categories(self, new_categories, ordered=None) -> Self: + """ + Reorder categories as specified in new_categories. + + ``new_categories`` need to include all old categories and no new category + items. + + Parameters + ---------- + new_categories : Index-like + The categories in new order. + ordered : bool, optional + Whether or not the categorical is treated as a ordered categorical. + If not given, do not change the ordered information. + """ + return type(self)._from_data( + { + self.name: self._column.reorder_categories( + new_categories, ordered=ordered + ) + } + ) + + def set_categories( + self, new_categories, ordered=None, rename: bool = False + ) -> Self: + """ + Set the categories to the specified new_categories. + + Parameters + ---------- + new_categories : list-like + The categories in new order. + ordered : bool, default None + Whether or not the categorical is treated as + a ordered categorical. If not given, do + not change the ordered information. + rename : bool, default False + Whether or not the `new_categories` should be + considered as a rename of the old categories + or as reordered categories. + """ + return type(self)._from_data( + { + self.name: self._column.set_categories( + new_categories, ordered=ordered, rename=rename + ) + } + ) + @_performance_tracking def interval_range( @@ -2862,6 +3020,7 @@ def __init__( dtype=None, copy: bool = False, name=None, + verify_integrity: bool = True, ): name = _getdefault_name(data, name=name) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 30b68574960..0678ebfdd81 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -84,6 +84,9 @@ {argument} inplace : bool, default False Modify the DataFrame in place (do not create a new object). + allow_duplicates : bool, default False + Allow duplicate column labels to be created. + Currently not supported. Returns ------- @@ -497,7 +500,7 @@ def empty(self): True .. pandas-compat:: - **DataFrame.empty, Series.empty** + :attr:`pandas.DataFrame.empty`, :attr:`pandas.Series.empty` If DataFrame/Series contains only `null` values, it is still not considered empty. See the example above. @@ -831,7 +834,7 @@ def replace( 4 4 9 e .. pandas-compat:: - **DataFrame.replace, Series.replace** + :meth:`pandas.DataFrame.replace`, :meth:`pandas.Series.replace` Parameters that are currently not supported are: `limit`, `regex`, `method` @@ -902,7 +905,7 @@ def replace( return self._mimic_inplace(result, inplace=inplace) @_performance_tracking - def clip(self, lower=None, upper=None, inplace=False, axis=1): + def clip(self, lower=None, upper=None, axis=1, inplace=False): """ Trim values at input threshold(s). @@ -1372,7 +1375,7 @@ def sum( dtype: int64 .. pandas-compat:: - **DataFrame.sum, Series.sum** + :meth:`pandas.DataFrame.sum`, :meth:`pandas.Series.sum` Parameters currently not supported are `level`, `numeric_only`. """ @@ -1433,7 +1436,7 @@ def product( dtype: int64 .. pandas-compat:: - **DataFrame.product, Series.product** + :meth:`pandas.DataFrame.product`, :meth:`pandas.Series.product` Parameters currently not supported are level`, `numeric_only`. 
""" @@ -1492,9 +1495,7 @@ def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs): **kwargs, ) - def median( - self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs - ): + def median(self, axis=None, skipna=True, numeric_only=None, **kwargs): """ Return the median of the values for the requested axis. @@ -1530,7 +1531,7 @@ def median( 17.0 .. pandas-compat:: - **DataFrame.median, Series.median** + :meth:`pandas.DataFrame.median`, :meth:`pandas.Series.median` Parameters currently not supported are `level` and `numeric_only`. """ @@ -1586,7 +1587,7 @@ def std( dtype: float64 .. pandas-compat:: - **DataFrame.std, Series.std** + :meth:`pandas.DataFrame.std`, :meth:`pandas.Series.std` Parameters currently not supported are `level` and `numeric_only` @@ -1645,7 +1646,7 @@ def var( dtype: float64 .. pandas-compat:: - **DataFrame.var, Series.var** + :meth:`pandas.DataFrame.var`, :meth:`pandas.Series.var` Parameters currently not supported are `level` and `numeric_only` @@ -1701,7 +1702,7 @@ def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): dtype: float64 .. pandas-compat:: - **DataFrame.kurtosis** + :meth:`pandas.DataFrame.kurtosis` Parameters currently not supported are `level` and `numeric_only` """ @@ -1763,7 +1764,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): dtype: float64 .. pandas-compat:: - **DataFrame.skew, Series.skew, Frame.skew** + :meth:`pandas.DataFrame.skew`, :meth:`pandas.Series.skew` The `axis` parameter is not currently supported. """ @@ -1779,7 +1780,14 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): ) @_performance_tracking - def mask(self, cond, other=None, inplace: bool = False) -> Self | None: + def mask( + self, + cond, + other=None, + inplace: bool = False, + axis=None, + level=None, + ) -> Self | None: """ Replace values where the condition is True. 
@@ -1831,6 +1839,10 @@ def mask(self, cond, other=None, inplace: bool = False) -> Self | None: 4 0 dtype: int64 """ + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") if not hasattr(cond, "__invert__"): # We Invert `cond` below and call `where`, so @@ -1843,7 +1855,16 @@ def mask(self, cond, other=None, inplace: bool = False) -> Self | None: @_performance_tracking @copy_docstring(Rolling) def rolling( - self, window, min_periods=None, center=False, axis=0, win_type=None + self, + window, + min_periods=None, + center: bool = False, + win_type: str | None = None, + on=None, + axis=0, + closed: str | None = None, + step: int | None = None, + method: str = "single", ): return Rolling( self, @@ -1851,7 +1872,11 @@ def rolling( min_periods=min_periods, center=center, axis=axis, + on=on, win_type=win_type, + closed=closed, + step=step, + method=method, ) @copy_docstring(ExponentialMovingWindow) @@ -1866,6 +1891,7 @@ def ewm( ignore_na: bool = False, axis: int = 0, times: str | np.ndarray | None = None, + method: Literal["single", "table"] = "single", ): return ExponentialMovingWindow( self, @@ -1878,6 +1904,7 @@ def ewm( ignore_na=ignore_na, axis=axis, times=times, + method=method, ) @_performance_tracking @@ -2042,13 +2069,26 @@ def interpolate( ) @_performance_tracking - def shift(self, periods=1, freq=None, axis=0, fill_value=None): + def shift( + self, + periods=1, + freq=None, + axis=0, + fill_value=None, + suffix: str | None = None, + ): """Shift values by `periods` positions.""" axis = self._get_axis_from_axis_arg(axis) if axis != 0: - raise ValueError("Only axis=0 is supported.") + raise NotImplementedError("Only axis=0 is supported.") if freq is not None: - raise ValueError("The freq argument is not yet supported.") + raise NotImplementedError( + "The freq argument is not yet supported." + ) + if suffix is not None: + raise NotImplementedError( + "The suffix argument is not yet supported." + ) data_columns = ( col.shift(periods, fill_value) for col in self._columns @@ -2229,7 +2269,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True): 2021-01-01 23:45:27 1 2 .. pandas-compat:: - **DataFrame.truncate, Series.truncate** + :meth:`pandas.DataFrame.truncate`, :meth:`pandas.Series.truncate` The ``copy`` parameter is only present for API compatibility, but ``copy=False`` is not supported. This method always generates a @@ -2665,7 +2705,7 @@ def sort_index( 2 3 1 .. pandas-compat:: - **DataFrame.sort_index, Series.sort_index** + :meth:`pandas.DataFrame.sort_index`, :meth:`pandas.Series.sort_index` * Not supporting: kind, sort_remaining=False """ @@ -3225,7 +3265,9 @@ def _split(self, splits, keep_index=True): ] @_performance_tracking - def bfill(self, value=None, axis=None, inplace=None, limit=None): + def bfill( + self, value=None, axis=None, inplace=None, limit=None, limit_area=None + ): """ Synonym for :meth:`Series.fillna` with ``method='bfill'``. @@ -3233,6 +3275,9 @@ def bfill(self, value=None, axis=None, inplace=None, limit=None): ------- Object with missing values filled or None if ``inplace=True``. 
""" + if limit_area is not None: + raise NotImplementedError("limit_area is currently not supported.") + with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) return self.fillna( @@ -3264,7 +3309,14 @@ def backfill(self, value=None, axis=None, inplace=None, limit=None): return self.bfill(value=value, axis=axis, inplace=inplace, limit=limit) @_performance_tracking - def ffill(self, value=None, axis=None, inplace=None, limit=None): + def ffill( + self, + value=None, + axis=None, + inplace=None, + limit=None, + limit_area: Literal["inside", "outside", None] = None, + ): """ Synonym for :meth:`Series.fillna` with ``method='ffill'``. @@ -3272,6 +3324,9 @@ def ffill(self, value=None, axis=None, inplace=None, limit=None): ------- Object with missing values filled or None if ``inplace=True``. """ + if limit_area is not None: + raise NotImplementedError("limit_area is currently not supported.") + with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) return self.fillna( @@ -3302,7 +3357,7 @@ def pad(self, value=None, axis=None, inplace=None, limit=None): ) return self.ffill(value=value, axis=axis, inplace=inplace, limit=limit) - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): """ Prefix labels with string `prefix`. @@ -3363,7 +3418,7 @@ def add_prefix(self, prefix): Use `Series.add_prefix` or `DataFrame.add_prefix`" ) - def add_suffix(self, suffix): + def add_suffix(self, suffix, axis=None): """ Suffix labels with string `suffix`. @@ -3464,6 +3519,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -3479,6 +3535,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the ``key`` argument in the + builtin ``sorted`` function, with the notable difference that + this ``key`` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + Currently not supported. Returns ------- @@ -3497,7 +3561,7 @@ def sort_values( 1 1 2 .. pandas-compat:: - **DataFrame.sort_values, Series.sort_values** + :meth:`pandas.DataFrame.sort_values`, :meth:`pandas.Series.sort_values` * Support axis='index' only. 
* Not supporting: inplace, kind
@@ -3518,6 +3582,8 @@ def sort_values(
 )
 if axis != 0:
 raise NotImplementedError("`axis` not currently implemented.")
+ if key is not None:
+ raise NotImplementedError("key is not currently supported.")
 if len(self) == 0:
 return self
@@ -3642,6 +3708,10 @@ def _reindex(
 index=None,
 inplace=False,
 fill_value=NA,
+ level=None,
+ method=None,
+ limit=None,
+ tolerance=None,
 ):
 """
 Helper for `.reindex`
@@ -3666,6 +3736,15 @@ def _reindex(
 -------
 Series or DataFrame
 """
+ if method is not None:
+ raise NotImplementedError("method is not currently supported.")
+ if level is not None:
+ raise NotImplementedError("level is not currently supported.")
+ if limit is not None:
+ raise NotImplementedError("limit is not currently supported.")
+ if tolerance is not None:
+ raise NotImplementedError("tolerance is not currently supported.")
+
 if dtypes is None:
 dtypes = {}
@@ -3877,16 +3956,15 @@ def resample(
 self,
 rule,
 axis=0,
- closed=None,
- label=None,
- convention="start",
+ closed: Literal["right", "left"] | None = None,
+ label: Literal["right", "left"] | None = None,
+ convention: Literal["start", "end", "s", "e"] = "start",
 kind=None,
- loffset=None,
- base=None,
 on=None,
 level=None,
 origin="start_day",
 offset=None,
+ group_keys: bool = False,
 ):
 """
 Convert the frequency of ("resample") the given time series data.
@@ -4008,7 +4086,7 @@ def resample(
 .. pandas-compat::
- **DataFrame.resample, Series.resample**
+ :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample`
 Note that the dtype of the index (or the 'on' column if using
 'on=') in the result will be of a frequency closest to the
@@ -4024,26 +4102,27 @@ def resample(
 "deprecated and will be removed in a future version. ",
 FutureWarning,
 )
- if (axis, convention, kind, loffset, base, origin, offset) != (
- 0,
- "start",
- None,
- None,
- None,
- "start_day",
- None,
- ):
- raise NotImplementedError(
- "The following arguments are not "
- "currently supported by resample:\n\n"
- "- axis\n"
- "- convention\n"
- "- kind\n"
- "- loffset\n"
- "- base\n"
- "- origin\n"
- "- offset"
+ raise NotImplementedError("kind is currently not supported.")
+ if axis != 0:
+ warnings.warn(
+ "The 'axis' keyword is "
+ "deprecated and will be removed in a future version. ",
+ FutureWarning,
 )
+ raise NotImplementedError("axis is currently not supported.")
+ if convention != "start":
+ warnings.warn(
+ "The 'convention' keyword is "
+ "deprecated and will be removed in a future version. ",
+ FutureWarning,
+ )
+ raise NotImplementedError("convention is currently not supported.")
+ if origin != "start_day":
+ raise NotImplementedError("origin is currently not supported.")
+ if offset is not None:
+ raise NotImplementedError("offset is currently not supported.")
+ if group_keys is not False:
+ raise NotImplementedError("group_keys is currently not supported.")
 by = cudf.Grouper(
 key=on, freq=rule, closed=closed, label=label, level=level
 )
@@ -4054,7 +4133,13 @@ def resample(
 )
 def dropna(
- self, axis=0, how="any", thresh=None, subset=None, inplace=False
+ self,
+ axis=0,
+ how="any",
+ thresh=None,
+ subset=None,
+ inplace=False,
+ ignore_index: bool = False,
 ):
 """
 Drop rows (or columns) containing nulls from a Column.
@@ -4078,6 +4163,8 @@ def dropna(
 columns, subset is a list of rows to consider.
 inplace : bool, default False
 If True, do operation inplace and return None.
+ ignore_index : bool, default ``False``
+ If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
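# --- Illustrative usage, not part of the patch --------------------------
# Sketch of the new ``ignore_index`` flag on ``dropna`` above: when True,
# the surviving rows are relabeled 0..n-1 instead of keeping their
# original index labels. Data are made up.
import cudf

df = cudf.DataFrame({"a": [1.0, None, 3.0]})
print(df.dropna().index)                   # keeps the labels 0 and 2
print(df.dropna(ignore_index=True).index)  # RangeIndex over 0..1
# -------------------------------------------------------------------------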
Returns ------- @@ -4154,6 +4241,8 @@ def dropna( """ if axis == 0: result = self._drop_na_rows(how=how, subset=subset, thresh=thresh) + if ignore_index: + result.index = RangeIndex(len(result)) else: result = self._drop_na_columns( how=how, subset=subset, thresh=thresh @@ -4292,8 +4381,22 @@ def take(self, indices, axis=0): return self._gather(GatherMap(indices, len(self), nullify=False)) - def _reset_index(self, level, drop, col_level=0, col_fill=""): + def _reset_index( + self, + level, + drop, + col_level=0, + col_fill="", + allow_duplicates: bool = False, + names: abc.Hashable | abc.Sequence[abc.Hashable] | None = None, + ): """Shared path for DataFrame.reset_index and Series.reset_index.""" + if allow_duplicates is not False: + raise NotImplementedError( + "allow_duplicates is not currently supported." + ) + elif names is not None: + raise NotImplementedError("names is not currently supported.") if level is not None: if ( isinstance(level, int) @@ -4564,7 +4667,7 @@ def sample( 1 2 4 .. pandas-compat:: - **DataFrame.sample, Series.sample** + :meth:`pandas.DataFrame.sample`, :meth:`pandas.Series.sample` When sampling from ``axis=0/'index'``, ``random_state`` can be either a numpy random state (``numpy.random.RandomState``) @@ -5249,7 +5352,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -5259,11 +5361,6 @@ def groupby( if axis not in (0, "index"): raise NotImplementedError("axis parameter is not yet implemented") - if squeeze is not False: - raise NotImplementedError( - "squeeze parameter is not yet implemented" - ) - if not observed: raise NotImplementedError( "observed parameter is not yet implemented" @@ -6224,6 +6321,7 @@ def rank( multiindex=self._data.multiindex, level_names=self._data.level_names, label_dtype=self._data.label_dtype, + verify=False, ), ) else: diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 9c81b0eb607..a0089242909 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -8,11 +8,7 @@ from typing_extensions import TypeAlias import cudf -from cudf.api.types import ( - _is_scalar_or_zero_d_array, - is_integer, - is_integer_dtype, -) +from cudf.api.types import _is_scalar_or_zero_d_array, is_integer from cudf.core.copy_types import BooleanMask, GatherMap @@ -233,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: return MaskIndexer(BooleanMask(key, n)) elif len(key) == 0: return EmptyIndexer() - elif is_integer_dtype(key.dtype): + elif key.dtype.kind in "iu": return MapIndexer(GatherMap(key, n, nullify=False)) else: raise TypeError( diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index dd0a4f666a1..32c84763401 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -9,7 +9,7 @@ import numpy as np import cudf -from cudf.api.types import is_decimal_dtype, is_dtype_equal +from cudf.api.types import is_decimal_dtype, is_dtype_equal, is_numeric_dtype from cudf.core.column import CategoricalColumn from cudf.core.dtypes import CategoricalDtype @@ -88,38 +88,25 @@ def _match_join_keys( ) if ( - np.issubdtype(ltype, np.number) - and np.issubdtype(rtype, np.number) - and not ( - np.issubdtype(ltype, np.timedelta64) - or np.issubdtype(rtype, np.timedelta64) - ) + is_numeric_dtype(ltype) + and is_numeric_dtype(rtype) + and not (ltype.kind == "m" or rtype.kind == "m") ): common_type 
= ( max(ltype, rtype) if ltype.kind == rtype.kind else np.result_type(ltype, rtype) ) - elif ( - np.issubdtype(ltype, np.datetime64) - and np.issubdtype(rtype, np.datetime64) - ) or ( - np.issubdtype(ltype, np.timedelta64) - and np.issubdtype(rtype, np.timedelta64) + elif (ltype.kind == "M" and rtype.kind == "M") or ( + ltype.kind == "m" and rtype.kind == "m" ): common_type = max(ltype, rtype) - elif ( - np.issubdtype(ltype, np.datetime64) - or np.issubdtype(ltype, np.timedelta64) - ) and not rcol.fillna(0).can_cast_safely(ltype): + elif ltype.kind in "mM" and not rcol.fillna(0).can_cast_safely(ltype): raise TypeError( f"Cannot join between {ltype} and {rtype}, please type-cast both " "columns to the same type." ) - elif ( - np.issubdtype(rtype, np.datetime64) - or np.issubdtype(rtype, np.timedelta64) - ) and not lcol.fillna(0).can_cast_safely(rtype): + elif rtype.kind in "mM" and not lcol.fillna(0).can_cast_safely(rtype): raise TypeError( f"Cannot join between {rtype} and {ltype}, please type-cast both " "columns to the same type." diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index ff4b06c6334..2788455aebf 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -150,7 +150,7 @@ def __init__( dtype=None, copy=False, name=None, - **kwargs, + verify_integrity=True, ): if sortorder is not None: raise NotImplementedError("sortorder is not yet supported") @@ -524,8 +524,10 @@ def codes(self): col.values for col in self._codes ) - def get_slice_bound(self, label, side, kind=None): - raise NotImplementedError() + def get_slice_bound(self, label, side): + raise NotImplementedError( + "get_slice_bound is not currently implemented." + ) @property # type: ignore @_performance_tracking @@ -1108,7 +1110,7 @@ def _concat(cls, objs): @classmethod @_performance_tracking - def from_tuples(cls, tuples, names=None): + def from_tuples(cls, tuples, sortorder: int | None = None, names=None): """ Convert list of tuples to MultiIndex. @@ -1116,6 +1118,9 @@ def from_tuples(cls, tuples, names=None): ---------- tuples : list / sequence of tuple-likes Each tuple is the index of one row/column. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). names : list / sequence of str, optional Names for the levels in the index. @@ -1142,13 +1147,24 @@ def from_tuples(cls, tuples, names=None): names=['number', 'color']) """ # Use Pandas for handling Python host objects - pdi = pd.MultiIndex.from_tuples(tuples, names=names) + pdi = pd.MultiIndex.from_tuples( + tuples, sortorder=sortorder, names=names + ) return cls.from_pandas(pdi) @_performance_tracking def to_numpy(self): return self.values_host + def to_flat_index(self): + """ + Convert a MultiIndex to an Index of Tuples containing the level values. + + This is not currently implemented + """ + # TODO: Could implement as Index of ListDtype? + raise NotImplementedError("to_flat_index is not currently supported.") + @property # type: ignore @_performance_tracking def values_host(self): @@ -1215,7 +1231,12 @@ def values(self): @classmethod @_performance_tracking - def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): + def from_frame( + cls, + df: pd.DataFrame | cudf.DataFrame, + sortorder: int | None = None, + names=None, + ): """ Make a MultiIndex from a DataFrame. @@ -1223,6 +1244,9 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): ---------- df : DataFrame DataFrame to be converted to MultiIndex. 
+ sortorder : int, optional + Level of sortedness (must be lexicographically sorted by that + level). names : list-like, optional If no names are provided, use the column names, or tuple of column names if the columns is a MultiIndex. If a sequence, overwrite @@ -1273,11 +1297,13 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None): else: source_data = df names = names if names is not None else source_data._column_names - return cls.from_arrays(source_data._columns, names=names) + return cls.from_arrays( + source_data._columns, sortorder=sortorder, names=names + ) @classmethod @_performance_tracking - def from_product(cls, arrays, names=None): + def from_product(cls, iterables, sortorder: int | None = None, names=None): """ Make a MultiIndex from the cartesian product of multiple iterables. @@ -1285,6 +1311,9 @@ def from_product(cls, arrays, names=None): ---------- iterables : list / sequence of iterables Each iterable has unique labels for each level of the index. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). names : list / sequence of str, optional Names for the levels in the index. If not explicitly provided, names will be inferred from the @@ -1314,7 +1343,9 @@ def from_product(cls, arrays, names=None): names=['number', 'color']) """ # Use Pandas for handling Python host objects - pdi = pd.MultiIndex.from_product(arrays, names=names) + pdi = pd.MultiIndex.from_product( + iterables, sortorder=sortorder, names=names + ) return cls.from_pandas(pdi) @classmethod @@ -1712,8 +1743,11 @@ def fillna(self, value): return super().fillna(value=value) @_performance_tracking - def unique(self): - return self.drop_duplicates(keep="first") + def unique(self, level: int | None = None) -> Self | cudf.Index: + if level is None: + return self.drop_duplicates(keep="first") + else: + return self.get_level_values(level).unique() @_performance_tracking def nunique(self, dropna: bool = True) -> int: diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index cdd4ec6f8e5..715bbf89b15 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -13,9 +13,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
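# --- Illustrative usage, not part of the patch --------------------------
# Sketch of ``MultiIndex.unique`` with the new ``level`` argument above:
# without ``level`` it deduplicates whole tuples, with ``level`` it
# returns the unique values of that single level. Data are made up.
import cudf

midx = cudf.MultiIndex.from_tuples(
    [("a", 1), ("a", 1), ("b", 2)], names=["k1", "k2"]
)
print(midx.unique())         # two distinct tuples remain
print(midx.unique(level=0))  # unique values of level 'k1': 'a', 'b'
# -------------------------------------------------------------------------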
+from __future__ import annotations import pickle import warnings +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -23,7 +25,6 @@ import cudf import cudf._lib.labeling import cudf.core.index -from cudf._typing import DataFrameOrSeries from cudf.core.groupby.groupby import ( DataFrameGroupBy, GroupBy, @@ -31,6 +32,9 @@ _Grouping, ) +if TYPE_CHECKING: + from cudf._typing import DataFrameOrSeries + class _Resampler(GroupBy): grouping: "_ResampleGrouping" @@ -39,8 +43,10 @@ def __init__(self, obj, by, axis=None, kind=None): by = _ResampleGrouping(obj, by) super().__init__(obj, by=by) - def agg(self, func): - result = super().agg(func) + def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + result = super().agg( + func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ) if len(self.grouping.bin_labels) != len(result): index = cudf.core.index.Index( self.grouping.bin_labels, name=self.grouping.names[0] diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 1120642947b..e7248977b1d 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -502,6 +502,7 @@ def melt( var_name=None, value_name="value", col_level=None, + ignore_index: bool = True, ): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -566,6 +567,8 @@ def melt( """ if col_level is not None: raise NotImplementedError("col_level != None is not supported yet.") + if ignore_index is not True: + raise NotImplementedError("ignore_index is currently not supported.") # Arg cleaning @@ -932,14 +935,10 @@ def _pivot(df, index, columns): index_labels, index_idx = index._encode() column_labels = columns_labels.to_pandas().to_flat_index() - # the result of pivot always has a multicolumn - result = cudf.core.column_accessor.ColumnAccessor( - multiindex=True, level_names=(None,) + columns._data.names - ) - def as_tuple(x): return x if isinstance(x, tuple) else (x,) + result = {} for v in df: names = [as_tuple(v) + as_tuple(name) for name in column_labels] nrows = len(index_labels) @@ -964,8 +963,12 @@ def as_tuple(x): } ) + # the result of pivot always has a multicolumn + ca = cudf.core.column_accessor.ColumnAccessor( + result, multiindex=True, level_names=(None,) + columns._data.names + ) return cudf.DataFrame._from_data( - result, index=cudf.Index(index_labels, name=index.name) + ca, index=cudf.Index(index_labels, name=index.name) ) @@ -1060,7 +1063,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): return result -def unstack(df, level, fill_value=None): +def unstack(df, level, fill_value=None, sort: bool = True): """ Pivot one or more levels of the (necessarily hierarchical) index labels. @@ -1080,6 +1083,9 @@ def unstack(df, level, fill_value=None): levels of the index to pivot fill_value Non-functional argument provided for compatibility with Pandas. + sort : bool, default True + Sort the level(s) in the resulting MultiIndex columns. 
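# --- Illustrative usage, not part of the patch --------------------------
# Sketch of ``unstack`` with the new ``sort`` guard, assuming the function
# is exposed as ``cudf.unstack`` as in recent releases: the default
# ``sort=True`` pivots an index level into columns, while ``sort=False``
# raises. Data are made up.
import cudf

df = cudf.DataFrame(
    {"a": ["x", "x", "y"], "b": [1, 2, 1], "v": [10, 20, 30]}
).set_index(["a", "b"])

print(cudf.unstack(df, level="b"))  # columns 1 and 2 taken from level 'b'

try:
    cudf.unstack(df, level="b", sort=False)
except NotImplementedError as err:
    print(err)  # sort=False is not supported.
# -------------------------------------------------------------------------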
+ Returns ------- @@ -1156,6 +1162,8 @@ def unstack(df, level, fill_value=None): if fill_value is not None: raise NotImplementedError("fill_value is not supported.") + elif sort is False: + raise NotImplementedError(f"{sort=} is not supported.") if pd.api.types.is_list_like(level): if not level: return df diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e12cc3d52fb..10ac1fdfc1e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -24,7 +24,6 @@ _is_scalar_or_zero_d_array, is_dict_like, is_integer, - is_integer_dtype, is_scalar, ) from cudf.core import indexing_utils @@ -213,7 +212,7 @@ def __setitem__(self, key, value): and self._frame.dtype.categories.dtype.kind == "f" ) ) - and isinstance(value, (np.float32, np.float64)) + and isinstance(value, np.floating) and np.isnan(value) ): raise MixedTypeError( @@ -356,12 +355,10 @@ def _loc_to_iloc(self, arg): ) if not _is_non_decimal_numeric_dtype(index_dtype) and not ( isinstance(index_dtype, cudf.CategoricalDtype) - and is_integer_dtype(index_dtype.categories.dtype) + and index_dtype.categories.dtype.kind in "iu" ): # TODO: switch to cudf.utils.dtypes.is_integer(arg) - if isinstance(arg, cudf.Scalar) and is_integer_dtype( - arg.dtype - ): + if isinstance(arg, cudf.Scalar) and arg.dtype.kind in "iu": # Do not remove until pandas 3.0 support is added. assert ( PANDAS_LT_300 @@ -921,7 +918,18 @@ def to_dict(self, into: type[dict] = dict) -> dict: return self.to_pandas().to_dict(into=into) @_performance_tracking - def reindex(self, *args, **kwargs): + def reindex( + self, + index=None, + *, + axis=None, + method: str | None = None, + copy: bool = True, + level=None, + fill_value: ScalarLike | None = None, + limit: int | None = None, + tolerance=None, + ): """ Conform Series to new index. @@ -930,6 +938,8 @@ def reindex(self, *args, **kwargs): index : Index, Series-convertible, default None New labels / index to conform to, should be specified using keywords. + axis: int, default None + Unused. method: Not Supported copy : boolean, default True level: Not Supported @@ -960,7 +970,7 @@ def reindex(self, *args, **kwargs): dtype: int64 .. pandas-compat:: - **Series.reindex** + :meth:`pandas.Series.reindex` Note: One difference from Pandas is that ``NA`` is used for rows that do not match, rather than ``NaN``. One side effect of this is @@ -968,27 +978,23 @@ def reindex(self, *args, **kwargs): where it is cast to float in Pandas. 
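# --- Illustrative usage, not part of the patch --------------------------
# Sketch of the rewritten ``Series.reindex`` below: ``index`` is the only
# positional argument, the remaining options are keyword-only, and the
# still-unsupported pandas options such as ``method`` raise.
# Data are made up.
import cudf

s = cudf.Series([10, 20, 30], index=["a", "b", "c"])
print(s.reindex(["b", "d"], fill_value=-1))  # b -> 20, d -> -1

try:
    s.reindex(["b", "d"], method="ffill")
except NotImplementedError as err:
    print(err)  # method is not currently supported.
# -------------------------------------------------------------------------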
""" - if len(args) > 1: - raise TypeError( - "Only one positional argument ('index') is allowed" - ) - if args: - (index,) = args - if "index" in kwargs: - raise TypeError( - "'index' passed as both positional and keyword argument" - ) - else: - index = kwargs.get("index", self.index) + if index is None: + index = self.index + if fill_value is None: + fill_value = cudf.NA name = self.name or 0 series = self._reindex( - deep=kwargs.get("copy", True), + deep=copy, dtypes={name: self.dtype}, index=index, column_names=[name], inplace=False, - fill_value=kwargs.get("fill_value", cudf.NA), + fill_value=fill_value, + level=level, + method=method, + limit=limit, + tolerance=tolerance, ) series.name = self.name return series @@ -1057,14 +1063,21 @@ def reindex(self, *args, **kwargs): ) ) def reset_index( - self, level=None, drop=False, name=no_default, inplace=False + self, + level=None, + drop=False, + name=no_default, + inplace=False, + allow_duplicates=False, ): if not drop and inplace: raise TypeError( "Cannot reset_index inplace on a Series " "to create a DataFrame" ) - data, index = self._reset_index(level=level, drop=drop) + data, index = self._reset_index( + level=level, drop=drop, allow_duplicates=allow_duplicates + ) if not drop: if name is no_default: name = 0 if self.name is None else self.name @@ -1243,7 +1256,7 @@ def map(self, arg, na_action=None) -> "Series": dtype: int64 .. pandas-compat:: - **Series.map** + :meth:`pandas.Series.map` Please note map currently only supports fixed-width numeric type functions. @@ -1635,7 +1648,9 @@ def has_nulls(self): return self._column.has_nulls() @_performance_tracking - def dropna(self, axis=0, inplace=False, how=None): + def dropna( + self, axis=0, inplace=False, how=None, ignore_index: bool = False + ): """ Return a Series with null values removed. @@ -1647,6 +1662,8 @@ def dropna(self, axis=0, inplace=False, how=None): If True, do operation inplace and return None. how : str, optional Not in use. Kept for compatibility. + ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. Returns ------- @@ -1712,6 +1729,9 @@ def dropna(self, axis=0, inplace=False, how=None): result = super().dropna(axis=axis) + if ignore_index: + result.index = RangeIndex(len(result)) + return self._mimic_inplace(result, inplace=inplace) @_performance_tracking @@ -2049,10 +2069,31 @@ def astype( return super().astype(dtype, copy, errors) @_performance_tracking - def sort_index(self, axis=0, *args, **kwargs): + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind=None, + na_position="last", + sort_remaining=True, + ignore_index=False, + key=None, + ): if axis not in (0, "index"): raise ValueError("Only axis=0 is valid for Series.") - return super().sort_index(axis=axis, *args, **kwargs) + return super().sort_index( + axis=axis, + level=level, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + key=key, + ) @_performance_tracking def sort_values( @@ -2063,6 +2104,7 @@ def sort_values( kind="quicksort", na_position="last", ignore_index=False, + key=None, ): """Sort by the values along either axis. @@ -2076,6 +2118,14 @@ def sort_values( 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. + key : callable, optional + Apply the key function to the values + before sorting. 
This is similar to the ``key`` argument in the + builtin ``sorted`` function, with the notable difference that + this ``key`` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. + Currently not supported. Returns ------- @@ -2094,7 +2144,7 @@ def sort_values( dtype: int64 .. pandas-compat:: - **Series.sort_values** + :meth:`pandas.Series.sort_values` * Support axis='index' only. * The inplace and kind argument is currently unsupported @@ -2107,6 +2157,7 @@ def sort_values( kind=kind, na_position=na_position, ignore_index=ignore_index, + key=key, ) @_performance_tracking @@ -2256,30 +2307,44 @@ def argsort( order=None, ascending=True, na_position="last", - ): - obj = self.__class__._from_data( - { - None: super().argsort( - axis=axis, - kind=kind, - order=order, - ascending=ascending, - na_position=na_position, - ) - } + ) -> Self: + col = as_column( + super().argsort( + axis=axis, + kind=kind, + order=order, + ascending=ascending, + na_position=na_position, + ) + ) + return self._from_data_like_self( + self._data._from_columns_like_self([col]) ) - obj.name = self.name - return obj @_performance_tracking - def replace(self, to_replace=None, value=no_default, *args, **kwargs): + def replace( + self, + to_replace=None, + value=no_default, + inplace=False, + limit=None, + regex=False, + method=no_default, + ): if is_dict_like(to_replace) and value not in {None, no_default}: raise ValueError( "Series.replace cannot use dict-like to_replace and non-None " "value" ) - return super().replace(to_replace, value, *args, **kwargs) + return super().replace( + to_replace, + value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) @_performance_tracking def update(self, other): @@ -2388,7 +2453,14 @@ def update(self, other): # UDF related @_performance_tracking - def apply(self, func, convert_dtype=True, args=(), **kwargs): + def apply( + self, + func, + convert_dtype=True, + args=(), + by_row: Literal[False, "compat"] = "compat", + **kwargs, + ): """ Apply a scalar function to the values of a Series. Similar to ``pandas.Series.apply``. @@ -2415,6 +2487,18 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): See examples for details. args : tuple Positional arguments passed to func after the series value. + by_row : False or "compat", default "compat" + If ``"compat"`` and func is a callable, func will be passed each element of + the Series, like ``Series.map``. If func is a list or dict of + callables, will first try to translate each func into pandas methods. If + that doesn't work, will try call to apply again with ``by_row="compat"`` + and if that fails, will call apply again with ``by_row=False`` + (backward compatible). + If False, the func will be passed the whole Series at once. + + ``by_row`` has no effect when ``func`` is a string. + + Currently not implemented. **kwargs Not supported @@ -2524,6 +2608,8 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): """ if convert_dtype is not True: raise ValueError("Series.apply only supports convert_dtype=True") + elif by_row != "compat": + raise NotImplementedError("by_row is currently not supported.") result = self._apply(func, _get_scalar_kernel, *args, **kwargs) result.name = self.name @@ -2550,7 +2636,7 @@ def count(self): 5 .. pandas-compat:: - **Series.count** + :meth:`pandas.Series.count` Parameters currently not supported is `level`. 
""" @@ -2624,7 +2710,7 @@ def mode(self, dropna=True): val_counts = val_counts[val_counts == val_counts.iloc[0]] return Series._from_data( - {self.name: val_counts.index.sort_values()}, name=self.name + {self.name: val_counts.index.sort_values()._column}, name=self.name ) @_performance_tracking @@ -2637,7 +2723,7 @@ def round(self, decimals=0, how="half_even"): return super().round(decimals, how) @_performance_tracking - def cov(self, other, min_periods=None): + def cov(self, other, min_periods=None, ddof: int | None = None): """ Compute covariance with Series, excluding missing values. @@ -2661,7 +2747,7 @@ def cov(self, other, min_periods=None): -0.015750000000000004 .. pandas-compat:: - **Series.cov** + :meth:`pandas.Series.cov` `min_periods` parameter is not yet supported. """ @@ -2670,6 +2756,8 @@ def cov(self, other, min_periods=None): raise NotImplementedError( "min_periods parameter is not implemented yet" ) + if ddof is not None: + raise NotImplementedError("ddof parameter is not implemented yet") if self.empty or other.empty: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -2687,14 +2775,6 @@ def cov(self, other, min_periods=None): f"{other.dtype}" ) - @_performance_tracking - def transpose(self): - """Return the transpose, which is by definition self.""" - - return self - - T = property(transpose, doc=transpose.__doc__) - @_performance_tracking def duplicated(self, keep="first"): """ @@ -3368,7 +3448,6 @@ def groupby( as_index=True, sort=no_default, group_keys=False, - squeeze=False, observed=True, dropna=True, ): @@ -3379,13 +3458,20 @@ def groupby( as_index, sort, group_keys, - squeeze, observed, dropna, ) @_performance_tracking - def rename(self, index=None, copy=True): + def rename( + self, + index=None, + axis=None, + copy: bool = True, + inplace: bool = False, + level=None, + errors: Literal["ignore", "raise"] = "ignore", + ): """ Alter Series name @@ -3395,8 +3481,21 @@ def rename(self, index=None, copy=True): ---------- index : Scalar, optional Scalar to alter the Series.name attribute + axis : {0 or 'index'} + Unused. Parameter needed for compatibility with DataFrame. copy : boolean, default True Also copy underlying data + inplace : bool, default False + Whether to return a new Series. If True the value of copy is ignored. + Currently not supported. + level : int or level name, default None + In case of MultiIndex, only rename labels in the specified level. + Currently not supported. + errors : {'ignore', 'raise'}, default 'ignore' + If 'raise', raise `KeyError` when a `dict-like mapper` or + `index` contains labels that are not present in the index being transformed. + If 'ignore', existing keys will be renamed and extra keys will be ignored. + Currently not supported. Returns ------- @@ -3422,16 +3521,23 @@ def rename(self, index=None, copy=True): 'numeric_series' .. 
pandas-compat:: - **Series.rename** + :meth:`pandas.Series.rename` - Supports scalar values only for changing name attribute - - The ``inplace`` and ``level`` is not supported """ + if inplace is not False: + raise NotImplementedError("inplace is currently not supported.") + if level is not None: + raise NotImplementedError("level is currently not supported.") + if errors != "ignore": + raise NotImplementedError("errors is currently not supported.") out_data = self._data.copy(deep=copy) return Series._from_data(out_data, self.index, name=index) @_performance_tracking - def add_prefix(self, prefix): + def add_prefix(self, prefix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") return Series._from_data( # TODO: Change to deep=False when copy-on-write is default data=self._data.copy(deep=True), @@ -3439,7 +3545,9 @@ def add_prefix(self, prefix): ) @_performance_tracking - def add_suffix(self, suffix): + def add_suffix(self, suffix, axis=None): + if axis is not None: + raise NotImplementedError("axis is currently not implemented.") return Series._from_data( # TODO: Change to deep=False when copy-on-write is default data=self._data.copy(deep=True), @@ -3529,7 +3637,12 @@ def explode(self, ignore_index=False): @_performance_tracking def pct_change( - self, periods=1, fill_method=no_default, limit=no_default, freq=None + self, + periods=1, + fill_method=no_default, + limit=no_default, + freq=None, + **kwargs, ): """ Calculates the percent change between sequential elements @@ -3554,6 +3667,9 @@ def pct_change( freq : str, optional Increment to use from time series API. Not yet implemented. + **kwargs + Additional keyword arguments are passed into + `Series.shift`. Returns ------- @@ -3598,11 +3714,15 @@ def pct_change( warnings.simplefilter("ignore") data = self.fillna(method=fill_method, limit=limit) diff = data.diff(periods=periods) - change = diff / data.shift(periods=periods, freq=freq) + change = diff / data.shift(periods=periods, freq=freq, **kwargs) return change @_performance_tracking - def where(self, cond, other=None, inplace=False): + def where(self, cond, other=None, inplace=False, axis=None, level=None): + if axis is not None: + raise NotImplementedError("axis is not supported.") + elif level is not None: + raise NotImplementedError("level is not supported.") result_col = super().where(cond, other, inplace) return self._mimic_inplace( self._from_data_like_self( @@ -4702,7 +4822,7 @@ def strftime(self, date_format: str, *args, **kwargs) -> Series: dtype: object .. pandas-compat:: - **series.DatetimeProperties.strftime** + :meth:`pandas.DatetimeIndex.strftime` The following date format identifiers are not yet supported: ``%c``, ``%x``,``%X`` diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 04c7db7a53c..a5ff1223791 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -12,7 +12,6 @@ from cudf.api.types import ( _is_scalar_or_zero_d_array, is_integer, - is_integer_dtype, is_numeric_dtype, ) from cudf.core.column import ColumnBase, as_column @@ -91,12 +90,6 @@ def shape(self) -> tuple[int]: """Get a tuple representing the dimensionality of the Index.""" return (len(self),) - def __bool__(self): - raise TypeError( - f"The truth value of a {type(self)} is ambiguous. Use " - "a.empty, a.bool(), a.item(), a.any() or a.all()." 
- ) - @property # type: ignore @_performance_tracking def _num_columns(self) -> int: @@ -358,7 +351,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: arg = as_column(arg) if len(arg) == 0: arg = cudf.core.column.column_empty(0, dtype="int32") - if is_integer_dtype(arg.dtype): + if arg.dtype.kind in "iu": return self._column.take(arg) if arg.dtype.kind == "b": if (bn := len(arg)) != (n := len(self)): @@ -396,3 +389,10 @@ def where(self, cond, other=None, inplace=False): result = cudf._lib.copying.copy_if_else(input_col, other, cond) return _make_categorical_like(result, self_column) + + @_performance_tracking + def transpose(self): + """Return the transpose, which is by definition self.""" + return self + + T = property(transpose, doc=transpose.__doc__) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 466d46f7dca..07158e4ee61 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -80,7 +80,7 @@ def to_numeric(arg, errors="raise", downcast=None): dtype: float64 .. pandas-compat:: - **cudf.to_numeric** + :func:`pandas.to_numeric` An important difference from pandas is that this function does not accept mixed numeric/non-numeric type sequences. diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index 21693e106bd..ef0f6958aeb 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -1,7 +1,9 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. - from __future__ import annotations +import warnings +from typing import Literal + import numpy as np from cudf._lib.reduce import scan @@ -56,7 +58,7 @@ class ExponentialMovingWindow(_RollingBase): the equivalent pandas method. .. pandas-compat:: - **cudf.core.window.ExponentialMovingWindow** + :meth:`pandas.DataFrame.ewm` The parameters ``min_periods``, ``ignore_na``, ``axis``, and ``times`` are not yet supported. Behavior is defined only for data that begins @@ -103,34 +105,79 @@ def __init__( ignore_na: bool = False, axis: int = 0, times: str | np.ndarray | None = None, + method: Literal["single", "table"] = "single", ): - if (min_periods, ignore_na, axis, times) != (0, False, 0, None): + if min_periods != 0: raise NotImplementedError( - "The parameters `min_periods`, `ignore_na`, " - "`axis`, and `times` are not yet supported." + "min_periods is currently not supported." ) - + if ignore_na is not False: + raise NotImplementedError("ignore_na is currently not supported.") + if axis != 0: + warnings.warn( + "axis is deprecated and will be removed in a future version. " + "Transpose the DataFrame first instead." + ) + raise NotImplementedError("axis is currently not supported.") + if times is not None: + raise NotImplementedError("times is currently not supported.") + if method != "single": + raise NotImplementedError("method is currently not supported.") self.obj = obj self.adjust = adjust self.com = get_center_of_mass(com, span, halflife, alpha) - def mean(self): + def online(self, engine: str = "numba", engine_kwargs=None): + """ + Return an ``OnlineExponentialMovingWindow`` object to calculate + exponentially moving window aggregations in an online method. + + Currently not supported. + """ + raise NotImplementedError("online is currently not supported.") + + def mean( + self, numeric_only: bool = False, engine=None, engine_kwargs=None ): """ Calculate the ewm (exponential weighted moment) mean.
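        For example, a minimal illustrative sketch (hypothetical data; the
        output values assume the standard adjusted EWM recurrence with
        ``com=0.5``, i.e. ``alpha = 2/3``):

        >>> import cudf
        >>> s = cudf.Series([1.0, 2.0, 3.0])
        >>> s.ewm(com=0.5).mean()
        0    1.000000
        1    1.750000
        2    2.615385
        dtype: float64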
""" + if numeric_only is not False: + raise NotImplementedError( + "numeric_only is currently not supported." + ) + if engine is not None: + raise NotImplementedError( + "engine is non-functional and added for compatibility with pandas." + ) + if engine_kwargs is not None: + raise NotImplementedError( + "engine_kwargs is non-functional and added for compatibility with pandas." + ) return self._apply_agg("ewma") - def var(self, bias): - raise NotImplementedError("ewmvar not yet supported.") + def sum(self, numeric_only: bool = False, engine=None, engine_kwargs=None): + raise NotImplementedError("sum not yet supported.") - def std(self, bias): - raise NotImplementedError("ewmstd not yet supported.") + def var(self, bias: bool = False, numeric_only: bool = False): + raise NotImplementedError("var not yet supported.") - def corr(self, other): - raise NotImplementedError("ewmcorr not yet supported.") + def std(self, bias: bool = False, numeric_only: bool = False): + raise NotImplementedError("std not yet supported.") - def cov(self, other): - raise NotImplementedError("ewmcov not yet supported.") + def corr( + self, other, pairwise: bool | None = None, numeric_only: bool = False + ): + raise NotImplementedError("corr not yet supported.") + + def cov( + self, + other, + pairwise: bool | None = None, + bias: bool = False, + numeric_only: bool = False, + ): + raise NotImplementedError("cov not yet supported.") def _apply_agg_series(self, sr, agg_name): if not is_numeric_dtype(sr.dtype): diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 29391c68471..043a41145e5 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,4 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION +from __future__ import annotations + +import warnings import numba import pandas as pd @@ -196,17 +199,26 @@ def __init__( obj, window, min_periods=None, - center=False, + center: bool = False, + win_type: str | None = None, + on=None, axis=0, - win_type=None, + closed: str | None = None, + step: int | None = None, + method: str = "single", ): self.obj = obj self.window = window self.min_periods = min_periods self.center = center self._normalize() - self.agg_params = {} + # for var & std only? + self.agg_params: dict[str, int] = {} if axis != 0: + warnings.warn( + "axis is deprecated with will be removed in a future version. " + "Transpose the DataFrame first instead." 
+ ) raise NotImplementedError("axis != 0 is not supported yet.") self.axis = axis @@ -217,6 +229,15 @@ def __init__( ) self.win_type = win_type + if on is not None: + raise NotImplementedError("on is currently not supported") + if closed not in (None, "right"): + raise NotImplementedError("closed is currently not supported") + if step is not None: + raise NotImplementedError("step is currently not supported") + if method != "single": + raise NotImplementedError("method is currently not supported") + def __getitem__(self, arg): if isinstance(arg, tuple): arg = list(arg) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index e909d96309e..0f2820a01e9 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -50,7 +50,7 @@ def read_csv( comment=None, delim_whitespace=False, byte_range=None, - use_python_file_object=True, + use_python_file_object=None, storage_options=None, bytes_per_thread=None, ): diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 7082a85237a..289292b5182 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -10,6 +10,7 @@ from cudf._lib import orc as liborc from cudf.api.types import is_list_like from cudf.utils import ioutils +from cudf.utils.utils import maybe_filter_deprecation def _make_empty_df(filepath_or_buffer, columns): @@ -280,7 +281,7 @@ def read_orc( num_rows=None, use_index=True, timestamp_type=None, - use_python_file_object=True, + use_python_file_object=None, storage_options=None, bytes_per_thread=None, ): @@ -320,6 +321,9 @@ def read_orc( ) filepaths_or_buffers = [] + have_nativefile = any( + isinstance(source, pa.NativeFile) for source in filepath_or_buffer + ) for source in filepath_or_buffer: if ioutils.is_directory( path_or_data=source, storage_options=storage_options @@ -360,17 +364,24 @@ def read_orc( stripes = selected_stripes if engine == "cudf": - return DataFrame._from_data( - *liborc.read_orc( - filepaths_or_buffers, - columns, - stripes, - skiprows, - num_rows, - use_index, - timestamp_type, + # Don't want to warn if use_python_file_object causes us to get + # a NativeFile (there is a separate deprecation warning for that) + with maybe_filter_deprecation( + not have_nativefile, + message="Support for reading pyarrow's NativeFile is deprecated", + category=FutureWarning, + ): + return DataFrame._from_data( + *liborc.read_orc( + filepaths_or_buffers, + columns, + stripes, + skiprows, + num_rows, + use_index, + timestamp_type, + ) ) - ) else: from pyarrow import orc diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 02b26ea1c01..7dab2f20100 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -15,6 +15,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from pyarrow import dataset as ds import cudf @@ -23,6 +24,7 @@ from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking +from cudf.utils.utils import maybe_filter_deprecation BYTE_SIZES = { "kb": 1000, @@ -350,7 +352,7 @@ def read_parquet_metadata(filepath_or_buffer): path_or_data=source, compression=None, fs=fs, - use_python_file_object=True, + use_python_file_object=None, open_file_options=None, storage_options=None, bytes_per_thread=None, @@ -532,7 +534,7 @@ def read_parquet( filters=None, row_groups=None, use_pandas_metadata=True, - use_python_file_object=True, + use_python_file_object=None, categorical_partitions=True, 
open_file_options=None, bytes_per_thread=None, @@ -615,6 +617,9 @@ def read_parquet( row_groups=row_groups, fs=fs, ) + have_nativefile = any( + isinstance(source, pa.NativeFile) for source in filepath_or_buffer + ) for source in filepath_or_buffer: tmp_source, compression = ioutils.get_reader_filepath_or_buffer( path_or_data=source, @@ -662,19 +667,26 @@ def read_parquet( ) # Convert parquet data to a cudf.DataFrame - df = _parquet_to_frame( - filepaths_or_buffers, - engine, - *args, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - partition_keys=partition_keys, - partition_categories=partition_categories, - dataset_kwargs=dataset_kwargs, - **kwargs, - ) + # Don't want to warn if use_python_file_object causes us to get + # a NativeFile (there is a separate deprecation warning for that) + with maybe_filter_deprecation( + not have_nativefile, + message="Support for reading pyarrow's NativeFile is deprecated", + category=FutureWarning, + ): + df = _parquet_to_frame( + filepaths_or_buffers, + engine, + *args, + columns=columns, + row_groups=row_groups, + use_pandas_metadata=use_pandas_metadata, + partition_keys=partition_keys, + partition_categories=partition_categories, + dataset_kwargs=dataset_kwargs, + **kwargs, + ) # Apply filters row-wise (if any are defined), and return df = _apply_post_filters(df, filters) if projected_columns: @@ -917,12 +929,12 @@ def _read_parquet( f"following positional arguments: {list(args)}" ) if cudf.get_option("io.parquet.low_memory"): - return libparquet.ParquetReader( + return libparquet.read_parquet_chunked( filepaths_or_buffers, columns=columns, row_groups=row_groups, use_pandas_metadata=use_pandas_metadata, - ).read() + ) else: return libparquet.read_parquet( filepaths_or_buffers, diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index ff445a63f74..bf88c950385 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -25,41 +25,39 @@ def install(): global LOADED LOADED = loader is not None - if (rmm_mode := os.getenv("CUDF_PANDAS_RMM_MODE", None)) is not None: - # Check if a non-default memory resource is set - current_mr = rmm.mr.get_current_device_resource() - if not isinstance(current_mr, rmm.mr.CudaMemoryResource): - warnings.warn( - f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}", - UserWarning, - ) - free_memory, _ = rmm.mr.available_device_memory() - free_memory = int(round(float(free_memory) * 0.80 / 256) * 256) + rmm_mode = os.getenv("CUDF_PANDAS_RMM_MODE", "managed_pool") + # Check if a non-default memory resource is set + current_mr = rmm.mr.get_current_device_resource() + if not isinstance(current_mr, rmm.mr.CudaMemoryResource): + warnings.warn( + f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}", + UserWarning, + ) + return rmm_mode - if rmm_mode == "cuda": - mr = rmm.mr.CudaMemoryResource() - rmm.mr.set_current_device_resource(mr) - elif rmm_mode == "pool": - rmm.mr.set_current_device_resource( - rmm.mr.PoolMemoryResource( - rmm.mr.get_current_device_resource(), - initial_pool_size=free_memory, - ) - ) - elif rmm_mode == "async": - mr = rmm.mr.CudaAsyncMemoryResource(initial_pool_size=free_memory) - rmm.mr.set_current_device_resource(mr) - elif rmm_mode == "managed": - mr = rmm.mr.ManagedMemoryResource() - rmm.mr.set_current_device_resource(mr) - elif rmm_mode == "managed_pool": - mr = 
rmm.mr.PoolMemoryResource( + free_memory, _ = rmm.mr.available_device_memory() + free_memory = int(round(float(free_memory) * 0.80 / 256) * 256) + new_mr = current_mr + if rmm_mode == "pool": + new_mr = rmm.mr.PoolMemoryResource( + current_mr, + initial_pool_size=free_memory, + ) + elif rmm_mode == "async": + new_mr = rmm.mr.CudaAsyncMemoryResource(initial_pool_size=free_memory) + elif rmm_mode == "managed": + new_mr = rmm.mr.PrefetchResourceAdaptor(rmm.mr.ManagedMemoryResource()) + elif rmm_mode == "managed_pool": + new_mr = rmm.mr.PrefetchResourceAdaptor( + rmm.mr.PoolMemoryResource( rmm.mr.ManagedMemoryResource(), initial_pool_size=free_memory, ) - rmm.mr.set_current_device_resource(mr) - else: - raise ValueError(f"Unsupported rmm mode: {rmm_mode}") + ) + elif rmm_mode != "cuda": + raise ValueError(f"Unsupported {rmm_mode=}") + rmm.mr.set_current_device_resource(new_mr) + return rmm_mode def pytest_load_initial_conftests(early_config, parser, args): diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index fb8569fa1d0..591744ce793 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -72,7 +72,17 @@ def main(): args = parser.parse_args() - install() + rmm_mode = install() + if "managed" in rmm_mode: + for key in { + "column_view::get_data", + "mutable_column_view::get_data", + "gather", + "hash_join", + }: + from cudf._lib import pylibcudf + + pylibcudf.experimental.enable_prefetching(key) with profile(args.profile, args.line_profile, args.args[0]) as fn: args.args[0] = fn if args.module: diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 59a243dd7c4..478108f36f1 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -26,6 +26,7 @@ ) import cudf +import cudf.core._compat from ..annotation import nvtx from ..fast_slow_proxy import ( @@ -556,13 +557,14 @@ def Index__setattr__(self, name, value): }, ) -ArrowStringArrayNumpySemantics = make_final_proxy_type( - "ArrowStringArrayNumpySemantics", - _Unusable, - pd.core.arrays.string_arrow.ArrowStringArrayNumpySemantics, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), -) +if cudf.core._compat.PANDAS_GE_210: + ArrowStringArrayNumpySemantics = make_final_proxy_type( + "ArrowStringArrayNumpySemantics", + _Unusable, + pd.core.arrays.string_arrow.ArrowStringArrayNumpySemantics, + fast_to_slow=_Unusable(), + slow_to_fast=_Unusable(), + ) ArrowStringArray = make_final_proxy_type( "ArrowStringArray", diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index a66f63c09b3..9c65b74d081 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -137,7 +137,7 @@ and not test_eof_states \ and not test_array_tz" # TODO: Remove "not db" once a postgres & mysql container is set up on the CI -PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \ +PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \ -v -m "not single_cpu and not db" \ -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array 
and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \ --import-mode=importlib \ diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index e56c8d867cb..c2072d90e98 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -158,12 +158,12 @@ def assert_column_equal( return True if check_datetimelike_compat: - if np.issubdtype(left.dtype, np.datetime64): + if left.dtype.kind == "M": right = right.astype(left.dtype) - elif np.issubdtype(right.dtype, np.datetime64): + elif right.dtype.kind == "M": left = left.astype(right.dtype) - if np.issubdtype(left.dtype, np.datetime64): + if left.dtype.kind == "M": if not left.equals(right): raise AssertionError( f"[datetimelike_compat=True] {left.values} " @@ -779,9 +779,7 @@ def assert_eq(left, right, **kwargs): tm.assert_index_equal(left, right, **kwargs) elif isinstance(left, np.ndarray) and isinstance(right, np.ndarray): - if np.issubdtype(left.dtype, np.floating) and np.issubdtype( - right.dtype, np.floating - ): + if left.dtype.kind == "f" and right.dtype.kind == "f": assert np.allclose(left, right, equal_nan=True) else: assert np.array_equal(left, right) diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 9b6029582ce..ae58af8ebce 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -891,3 +891,59 @@ def test_categorical_maxima(op): result = getattr(ser.cat.as_ordered(), op)() result_pd = getattr(ser_pd.cat.as_ordered(), op)() assert_eq(result, result_pd) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_ordered(ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered) + cudf_ci = cudf.from_pandas(pd_ci) + assert pd_ci.ordered == cudf_ci.ordered + + +@pytest.mark.parametrize("method", ["as_ordered", "as_unordered"]) +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_as_ordered(method, ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = getattr(pd_ci, method)() + result = getattr(cudf_ci, method)() + assert_eq(result, expected) + + +def test_index_add_categories(): + pd_ci = pd.CategoricalIndex([1, 2, 3]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.add_categories([4]) + result = cudf_ci.add_categories([4]) + assert_eq(result, expected) + + +def test_index_remove_categories(): + pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.remove_categories([4]) + result = cudf_ci.remove_categories([4]) + assert_eq(result, expected) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_reorder_categories(ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 3, 2, 4]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.reorder_categories([1, 2, 3, 4], ordered=ordered) + result = cudf_ci.reorder_categories([1, 2, 3, 4], ordered=ordered) + assert_eq(result, expected) + + +@pytest.mark.parametrize("ordered", [True, False]) +def test_index_set_categories(ordered): + pd_ci = pd.CategoricalIndex([1, 2, 3]) + cudf_ci = cudf.from_pandas(pd_ci) + + expected = pd_ci.set_categories([1, 2, 3, 4], ordered=ordered) + result = cudf_ci.set_categories([1, 2, 3, 4], ordered=ordered) + assert_eq(result, expected) diff --git 
a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py index f3343c37d1d..e84e1433c10 100644 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ b/python/cudf/cudf/tests/test_column_accessor.py @@ -5,28 +5,35 @@ import pytest import cudf +from cudf.core.column import as_column from cudf.core.column_accessor import ColumnAccessor from cudf.testing import assert_eq simple_test_data = [ {}, - {"a": []}, - {"a": [1]}, - {"a": ["a"]}, - {"a": [1, 2, 3], "b": ["a", "b", "c"]}, + {"a": as_column([])}, + {"a": as_column([1])}, + {"a": as_column(["a"])}, + {"a": as_column([1, 2, 3]), "b": as_column(["a", "b", "c"])}, ] mi_test_data = [ - {("a", "b"): [1, 2, 4], ("a", "c"): [2, 3, 4]}, - {("a", "b"): [1, 2, 3], ("a", ""): [2, 3, 4]}, - {("a", "b"): [1, 2, 4], ("c", "d"): [2, 3, 4]}, - {("a", "b"): [1, 2, 3], ("a", "c"): [2, 3, 4], ("b", ""): [4, 5, 6]}, + {("a", "b"): as_column([1, 2, 4]), ("a", "c"): as_column([2, 3, 4])}, + {("a", "b"): as_column([1, 2, 3]), ("a", ""): as_column([2, 3, 4])}, + {("a", "b"): as_column([1, 2, 4]), ("c", "d"): as_column([2, 3, 4])}, + { + ("a", "b"): as_column([1, 2, 3]), + ("a", "c"): as_column([2, 3, 4]), + ("b", ""): as_column([4, 5, 6]), + }, ] def check_ca_equal(lhs, rhs): assert lhs.level_names == rhs.level_names assert lhs.multiindex == rhs.multiindex + assert lhs.rangeindex == rhs.rangeindex + assert lhs.label_dtype == rhs.label_dtype for l_key, r_key in zip(lhs, rhs): assert l_key == r_key assert_eq(lhs[l_key], rhs[r_key]) @@ -58,19 +65,26 @@ def test_to_pandas_simple(simple_data): # to ignore this `inferred_type` comparison, we pass exact=False. assert_eq( ca.to_pandas_index(), - pd.DataFrame(simple_data).columns, + pd.DataFrame( + {key: value.values_host for key, value in simple_data.items()} + ).columns, exact=False, ) def test_to_pandas_multiindex(mi_data): ca = ColumnAccessor(mi_data, multiindex=True) - assert_eq(ca.to_pandas_index(), pd.DataFrame(mi_data).columns) + assert_eq( + ca.to_pandas_index(), + pd.DataFrame( + {key: value.values_host for key, value in mi_data.items()} + ).columns, + ) def test_to_pandas_multiindex_names(): ca = ColumnAccessor( - {("a", "b"): [1, 2, 3], ("c", "d"): [3, 4, 5]}, + {("a", "b"): as_column([1, 2, 3]), ("c", "d"): as_column([3, 4, 5])}, multiindex=True, level_names=("foo", "bar"), ) @@ -108,16 +122,20 @@ def test_column_size_mismatch(): differing sizes throws an error. 
""" with pytest.raises(ValueError): - ColumnAccessor({"a": [1], "b": [1, 2]}) + ColumnAccessor({"a": as_column([1]), "b": as_column([1, 2])}) def test_select_by_label_simple(): """ Test getting a column by label """ - ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4]}) - check_ca_equal(ca.select_by_label("a"), ColumnAccessor({"a": [1, 2, 3]})) - check_ca_equal(ca.select_by_label("b"), ColumnAccessor({"b": [2, 3, 4]})) + ca = ColumnAccessor({"a": as_column([1, 2, 3]), "b": as_column([2, 3, 4])}) + check_ca_equal( + ca.select_by_label("a"), ColumnAccessor({"a": as_column([1, 2, 3])}) + ) + check_ca_equal( + ca.select_by_label("b"), ColumnAccessor({"b": as_column([2, 3, 4])}) + ) def test_select_by_label_multiindex(): @@ -126,40 +144,62 @@ def test_select_by_label_multiindex(): """ ca = ColumnAccessor( { - ("a", "b", "c"): [1, 2, 3], - ("a", "b", "e"): [2, 3, 4], - ("b", "x", ""): [4, 5, 6], - ("a", "d", "e"): [3, 4, 5], + ("a", "b", "c"): as_column([1, 2, 3]), + ("a", "b", "e"): as_column([2, 3, 4]), + ("b", "x", ""): as_column([4, 5, 6]), + ("a", "d", "e"): as_column([3, 4, 5]), }, multiindex=True, ) expect = ColumnAccessor( - {("b", "c"): [1, 2, 3], ("b", "e"): [2, 3, 4], ("d", "e"): [3, 4, 5]}, + { + ("b", "c"): as_column([1, 2, 3]), + ("b", "e"): as_column([2, 3, 4]), + ("d", "e"): as_column([3, 4, 5]), + }, multiindex=True, ) got = ca.select_by_label("a") check_ca_equal(expect, got) - expect = ColumnAccessor({"c": [1, 2, 3], "e": [2, 3, 4]}, multiindex=False) + expect = ColumnAccessor( + {"c": as_column([1, 2, 3]), "e": as_column([2, 3, 4])}, + multiindex=False, + ) got = ca.select_by_label(("a", "b")) check_ca_equal(expect, got) expect = ColumnAccessor( - {("b", "c"): [1, 2, 3], ("b", "e"): [2, 3, 4], ("d", "e"): [3, 4, 5]}, + { + ("b", "c"): as_column([1, 2, 3]), + ("b", "e"): as_column([2, 3, 4]), + ("d", "e"): as_column([3, 4, 5]), + }, multiindex=True, ) got = ca.select_by_label("a") check_ca_equal(expect, got) - expect = ColumnAccessor({"c": [1, 2, 3], "e": [2, 3, 4]}, multiindex=False) + expect = ColumnAccessor( + {"c": as_column([1, 2, 3]), "e": as_column([2, 3, 4])}, + multiindex=False, + ) got = ca.select_by_label(("a", "b")) check_ca_equal(expect, got) def test_select_by_label_simple_slice(): - ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]}) - expect = ColumnAccessor({"b": [2, 3, 4], "c": [3, 4, 5]}) + ca = ColumnAccessor( + { + "a": as_column([1, 2, 3]), + "b": as_column([2, 3, 4]), + "c": as_column([3, 4, 5]), + } + ) + expect = ColumnAccessor( + {"b": as_column([2, 3, 4]), "c": as_column([3, 4, 5])} + ) got = ca.select_by_label(slice("b", "c")) check_ca_equal(expect, got) @@ -167,10 +207,10 @@ def test_select_by_label_simple_slice(): def test_select_by_label_multiindex_slice(): ca = ColumnAccessor( { - ("a", "b", "c"): [1, 2, 3], - ("a", "b", "e"): [2, 3, 4], - ("a", "d", "e"): [3, 4, 5], - ("b", "x", ""): [4, 5, 6], + ("a", "b", "c"): as_column([1, 2, 3]), + ("a", "b", "e"): as_column([2, 3, 4]), + ("a", "d", "e"): as_column([3, 4, 5]), + ("b", "x", ""): as_column([4, 5, 6]), }, multiindex=True, ) # pandas needs columns to be sorted to do slicing with multiindex @@ -180,9 +220,9 @@ def test_select_by_label_multiindex_slice(): expect = ColumnAccessor( { - ("a", "b", "e"): [2, 3, 4], - ("a", "d", "e"): [3, 4, 5], - ("b", "x", ""): [4, 5, 6], + ("a", "b", "e"): as_column([2, 3, 4]), + ("a", "d", "e"): as_column([3, 4, 5]), + ("b", "x", ""): as_column([4, 5, 6]), }, multiindex=True, ) @@ -191,8 +231,16 @@ def test_select_by_label_multiindex_slice(): 
def test_by_label_list(): - ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]}) - expect = ColumnAccessor({"b": [2, 3, 4], "c": [3, 4, 5]}) + ca = ColumnAccessor( + { + "a": as_column([1, 2, 3]), + "b": as_column([2, 3, 4]), + "c": as_column([3, 4, 5]), + } + ) + expect = ColumnAccessor( + {"b": as_column([2, 3, 4]), "c": as_column([3, 4, 5])} + ) got = ca.select_by_label(["b", "c"]) check_ca_equal(expect, got) @@ -201,9 +249,13 @@ def test_select_by_index_simple(): """ Test getting a column by label """ - ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4]}) - check_ca_equal(ca.select_by_index(0), ColumnAccessor({"a": [1, 2, 3]})) - check_ca_equal(ca.select_by_index(1), ColumnAccessor({"b": [2, 3, 4]})) + ca = ColumnAccessor({"a": as_column([1, 2, 3]), "b": as_column([2, 3, 4])}) + check_ca_equal( + ca.select_by_index(0), ColumnAccessor({"a": as_column([1, 2, 3])}) + ) + check_ca_equal( + ca.select_by_index(1), ColumnAccessor({"b": as_column([2, 3, 4])}) + ) check_ca_equal(ca.select_by_index([0, 1]), ca) check_ca_equal(ca.select_by_index(slice(0, None)), ca) @@ -214,19 +266,19 @@ def test_select_by_index_multiindex(): """ ca = ColumnAccessor( { - ("a", "b", "c"): [1, 2, 3], - ("a", "b", "e"): [2, 3, 4], - ("b", "x", ""): [4, 5, 6], - ("a", "d", "e"): [3, 4, 5], + ("a", "b", "c"): as_column([1, 2, 3]), + ("a", "b", "e"): as_column([2, 3, 4]), + ("b", "x", ""): as_column([4, 5, 6]), + ("a", "d", "e"): as_column([3, 4, 5]), }, multiindex=True, ) expect = ColumnAccessor( { - ("a", "b", "c"): [1, 2, 3], - ("a", "b", "e"): [2, 3, 4], - ("b", "x", ""): [4, 5, 6], + ("a", "b", "c"): as_column([1, 2, 3]), + ("a", "b", "e"): as_column([2, 3, 4]), + ("b", "x", ""): as_column([4, 5, 6]), }, multiindex=True, ) @@ -235,9 +287,9 @@ def test_select_by_index_multiindex(): expect = ColumnAccessor( { - ("a", "b", "c"): [1, 2, 3], - ("a", "b", "e"): [2, 3, 4], - ("a", "d", "e"): [3, 4, 5], + ("a", "b", "c"): as_column([1, 2, 3]), + ("a", "b", "e"): as_column([2, 3, 4]), + ("a", "d", "e"): as_column([3, 4, 5]), }, multiindex=True, ) @@ -248,10 +300,10 @@ def test_select_by_index_multiindex(): def test_select_by_index_empty(): ca = ColumnAccessor( { - ("a", "b", "c"): [1, 2, 3], - ("a", "b", "e"): [2, 3, 4], - ("b", "x", ""): [4, 5, 6], - ("a", "d", "e"): [3, 4, 5], + ("a", "b", "c"): as_column([1, 2, 3]), + ("a", "b", "e"): as_column([2, 3, 4]), + ("b", "x", ""): as_column([4, 5, 6]), + ("a", "d", "e"): as_column([3, 4, 5]), }, multiindex=True, ) @@ -267,12 +319,20 @@ def test_select_by_index_empty(): def test_replace_level_values_RangeIndex(): ca = ColumnAccessor( - {("a"): [1, 2, 3], ("b"): [2, 3, 4], ("c"): [3, 4, 5]}, + { + ("a"): as_column([1, 2, 3]), + ("b"): as_column([2, 3, 4]), + ("c"): as_column([3, 4, 5]), + }, multiindex=False, ) expect = ColumnAccessor( - {("f"): [1, 2, 3], ("b"): [2, 3, 4], ("c"): [3, 4, 5]}, + { + ("f"): as_column([1, 2, 3]), + ("b"): as_column([2, 3, 4]), + ("c"): as_column([3, 4, 5]), + }, multiindex=False, ) @@ -282,12 +342,20 @@ def test_replace_level_values_RangeIndex(): def test_replace_level_values_MultiColumn(): ca = ColumnAccessor( - {("a", 1): [1, 2, 3], ("a", 2): [2, 3, 4], ("b", 1): [3, 4, 5]}, + { + ("a", 1): as_column([1, 2, 3]), + ("a", 2): as_column([2, 3, 4]), + ("b", 1): as_column([3, 4, 5]), + }, multiindex=True, ) expect = ColumnAccessor( - {("f", 1): [1, 2, 3], ("f", 2): [2, 3, 4], ("b", 1): [3, 4, 5]}, + { + ("f", 1): as_column([1, 2, 3]), + ("f", 2): as_column([2, 3, 4]), + ("b", 1): as_column([3, 4, 5]), + }, multiindex=True, ) @@ 
-303,7 +371,17 @@ def test_clear_nrows_empty_before(): def test_clear_nrows_empty_after(): - ca = ColumnAccessor({"new": [1]}) + ca = ColumnAccessor({"new": as_column([1])}) assert ca.nrows == 1 del ca["new"] assert ca.nrows == 0 + + +def test_not_rangeindex_and_multiindex(): + with pytest.raises(ValueError): + ColumnAccessor({}, multiindex=True, rangeindex=True) + + +def test_data_values_not_column_raises(): + with pytest.raises(ValueError): + ColumnAccessor({"a": [1]}) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index a22a627523f..6a21cb1b9d7 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1085,8 +1085,9 @@ def test_csv_reader_arrow_nativefile(path_or_buf): # Arrow FileSystem interface expect = cudf.read_csv(path_or_buf("filepath")) fs, path = pa_fs.FileSystem.from_uri(path_or_buf("filepath")) - with fs.open_input_file(path) as fil: - got = cudf.read_csv(fil) + with pytest.warns(FutureWarning): + with fs.open_input_file(path) as fil: + got = cudf.read_csv(fil) assert_eq(expect, got) @@ -1617,7 +1618,7 @@ def test_csv_reader_partial_dtype(dtype): StringIO('"A","B","C"\n0,1,2'), dtype=dtype, usecols=["A", "C"] ) - assert names_df == header_df + assert_eq(names_df, header_df) assert all(names_df.dtypes == ["int16", "int64"]) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2009fc49ce5..e2ce5c03b70 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10833,7 +10833,7 @@ def test_dataframe_contains(name, contains, other_names): expectation = contains is cudf.NA and name is cudf.NA assert (contains in pdf) == expectation assert (contains in gdf) == expectation - elif pd.api.types.is_float_dtype(gdf.columns.dtype): + elif gdf.columns.dtype.kind == "f": # In some cases, the columns are converted to an Index[float] based on # the other column names. That casts name values from None to np.nan. 
expectation = contains is np.nan and (name is None or name is np.nan) @@ -11100,3 +11100,12 @@ def test_from_records_with_index_no_shallow_copy(): data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", " PythonFile conversion diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 826a0e52f57..74f04c0584f 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3885,3 +3885,28 @@ def test_group_by_raises_category_error(op): with pytest.raises(TypeError): df.groupby(df.a).agg(op) + + +def test_ngroups(): + pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) + gdf = cudf.DataFrame.from_pandas(pdf) + + pgb = pdf.groupby("a") + ggb = gdf.groupby("a") + assert pgb.ngroups == ggb.ngroups + assert len(pgb) == len(ggb) + + +def test_ndim(): + pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) + gdf = cudf.DataFrame.from_pandas(pdf) + + pgb = pdf.groupby("a") + ggb = gdf.groupby("a") + assert pgb.ndim == ggb.ndim + + pser = pd.Series(range(3)) + gser = cudf.Series.from_pandas(pser) + pgb = pser.groupby([0, 0, 1]) + ggb = gser.groupby(cudf.Series([0, 0, 1])) + assert pgb.ndim == ggb.ndim diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 9eba6122d26..722a64cb553 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -3294,3 +3294,12 @@ def test_index_assignment_no_shallow_copy(index): df = cudf.DataFrame(range(1)) df.index = index assert df.index is index + + +def test_bool_rangeindex_raises(): + assert_exceptions_equal( + lfunc=bool, + rfunc=bool, + lfunc_args_and_kwargs=[[pd.RangeIndex(0)]], + rfunc_args_and_kwargs=[[cudf.RangeIndex(0)]], + ) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 1941eec91eb..b7314a36e73 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2161,3 +2161,21 @@ def test_nunique(array, dropna): result = gidx.nunique(dropna=dropna) expected = pidx.nunique(dropna=dropna) assert result == expected + + +def test_bool_raises(): + assert_exceptions_equal( + lfunc=bool, + rfunc=bool, + lfunc_args_and_kwargs=[[cudf.MultiIndex.from_arrays([range(1)])]], + rfunc_args_and_kwargs=[[pd.MultiIndex.from_arrays([range(1)])]], + ) + + +def test_unique_level(): + pd_mi = pd.MultiIndex.from_arrays([[1, 1, 2], [3, 3, 2]]) + cudf_mi = cudf.MultiIndex.from_pandas(pd_mi) + + result = pd_mi.unique(level=1) + expected = cudf_mi.unique(level=1) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index ecb7fd44422..3806b901b10 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -22,7 +22,7 @@ from pyarrow import fs as pa_fs, parquet as pq import cudf -from cudf._lib.parquet import ParquetReader +from cudf._lib.parquet import read_parquet_chunked from cudf.io.parquet import ( ParquetDatasetWriter, ParquetWriter, @@ -711,7 +711,8 @@ def test_parquet_reader_arrow_nativefile(parquet_path_or_buf): expect = cudf.read_parquet(parquet_path_or_buf("filepath")) fs, path = pa_fs.FileSystem.from_uri(parquet_path_or_buf("filepath")) with fs.open_input_file(path) as fil: - got = cudf.read_parquet(fil) + with pytest.warns(FutureWarning): + got = cudf.read_parquet(fil) assert_eq(expect, got) @@ -726,16 +727,18 @@ def test_parquet_reader_use_python_file_object( fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath")) 
# Pass open fsspec file - with fs.open(paths[0], mode="rb") as fil: - got1 = cudf.read_parquet( - fil, use_python_file_object=use_python_file_object - ) + with pytest.warns(FutureWarning): + with fs.open(paths[0], mode="rb") as fil: + got1 = cudf.read_parquet( + fil, use_python_file_object=use_python_file_object + ) assert_eq(expect, got1) # Pass path only - got2 = cudf.read_parquet( - paths[0], use_python_file_object=use_python_file_object - ) + with pytest.warns(FutureWarning): + got2 = cudf.read_parquet( + paths[0], use_python_file_object=use_python_file_object + ) assert_eq(expect, got2) @@ -3752,7 +3755,7 @@ def test_parquet_chunked_reader( ) buffer = BytesIO() df.to_parquet(buffer) - reader = ParquetReader( + actual = read_parquet_chunked( [buffer], chunk_read_limit=chunk_read_limit, pass_read_limit=pass_read_limit, @@ -3762,7 +3765,6 @@ def test_parquet_chunked_reader( expected = cudf.read_parquet( buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups ) - actual = reader.read() assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 1247fa362ce..8be6463c699 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -248,16 +248,11 @@ def test_sum_masked(nelem): def test_sum_boolean(): s = Series(np.arange(100000)) - got = (s > 1).sum(dtype=np.int32) + got = (s > 1).sum() expect = 99998 assert expect == got - got = (s > 1).sum(dtype=np.bool_) - expect = True - - assert expect == got - def test_date_minmax(): np_data = np.random.normal(size=10**3) @@ -371,3 +366,11 @@ def test_reduction_column_multiindex(): result = df.mean() expected = df.to_pandas().mean() assert_eq(result, expected) + + +@pytest.mark.parametrize("op", ["sum", "product"]) +def test_dtype_deprecated(op): + ser = cudf.Series(range(5)) + with pytest.warns(FutureWarning): + result = getattr(ser, op)(dtype=np.dtype(np.int8)) + assert isinstance(result, np.int8) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index a44bf791767..3ae318d3bf5 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -138,22 +138,24 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): buffer = pdf.to_csv(index=False) # Use fsspec file object - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - use_python_file_object=False, - ) + with pytest.warns(FutureWarning): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + use_python_file_object=False, + ) assert_eq(pdf, got) # Use Arrow PythonFile object - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - use_python_file_object=True, - ) + with pytest.warns(FutureWarning): + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + use_python_file_object=True, + ) assert_eq(pdf, got) @@ -166,8 +168,9 @@ def test_read_csv_arrow_nativefile(s3_base, s3so, pdf): fs = pa_fs.S3FileSystem( endpoint_override=s3so["client_kwargs"]["endpoint_url"], ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_csv(fil) + with 
pytest.warns(FutureWarning): + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_csv(fil) assert_eq(pdf, got) @@ -184,17 +187,18 @@ def test_read_csv_byte_range( # Use fsspec file object with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - byte_range=(74, 73), - bytes_per_thread=bytes_per_thread - if not use_python_file_object - else None, - header=None, - names=["Integer", "Float", "Integer2", "String", "Boolean"], - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got = cudf.read_csv( + f"s3://{bucket}/{fname}", + storage_options=s3so, + byte_range=(74, 73), + bytes_per_thread=bytes_per_thread + if not use_python_file_object + else None, + header=None, + names=["Integer", "Float", "Integer2", "String", "Boolean"], + use_python_file_object=use_python_file_object, + ) assert_eq(pdf.iloc[-2:].reset_index(drop=True), got) @@ -241,18 +245,19 @@ def test_read_parquet( # Check direct path handling buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got1 = cudf.read_parquet( - f"s3://{bucket}/{fname}", - open_file_options=( - {"precache_options": {"method": precache}} - if use_python_file_object - else None - ), - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - columns=columns, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got1 = cudf.read_parquet( + f"s3://{bucket}/{fname}", + open_file_options=( + {"precache_options": {"method": precache}} + if use_python_file_object + else None + ), + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + columns=columns, + use_python_file_object=use_python_file_object, + ) expect = pdf[columns] if columns else pdf assert_eq(expect, got1) @@ -263,12 +268,13 @@ def test_read_parquet( f"s3://{bucket}/{fname}", storage_options=s3so )[0] with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f: - got2 = cudf.read_parquet( - f, - bytes_per_thread=bytes_per_thread, - columns=columns, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got2 = cudf.read_parquet( + f, + bytes_per_thread=bytes_per_thread, + columns=columns, + use_python_file_object=use_python_file_object, + ) assert_eq(expect, got2) @@ -353,11 +359,12 @@ def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): pdf.to_parquet(path=buffer) buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - fs = pa_fs.S3FileSystem( - endpoint_override=s3so["client_kwargs"]["endpoint_url"], - ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_parquet(fil, columns=columns) + with pytest.warns(FutureWarning): + fs = pa_fs.S3FileSystem( + endpoint_override=s3so["client_kwargs"]["endpoint_url"], + ) + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_parquet(fil, columns=columns) expect = pdf[columns] if columns else pdf assert_eq(expect, got) @@ -372,12 +379,13 @@ def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache): buffer.seek(0) filters = [("String", "==", "Omega")] with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - filters=filters, - open_file_options={"precache_options": {"method": precache}}, - ) + with pytest.warns(FutureWarning): + got = cudf.read_parquet( + f"s3://{bucket}/{fname}", + storage_options=s3so, + 
filters=filters, + open_file_options={"precache_options": {"method": precache}}, + ) # All row-groups should be filtered out assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True)) @@ -449,12 +457,13 @@ def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): buffer = f.read() with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_orc( - f"s3://{bucket}/{fname}", - columns=columns, - storage_options=s3so, - use_python_file_object=use_python_file_object, - ) + with pytest.warns(FutureWarning): + got = cudf.read_orc( + f"s3://{bucket}/{fname}", + columns=columns, + storage_options=s3so, + use_python_file_object=use_python_file_object, + ) if columns: expect = expect[columns] @@ -475,8 +484,9 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): fs = pa_fs.S3FileSystem( endpoint_override=s3so["client_kwargs"]["endpoint_url"], ) - with fs.open_input_file(f"{bucket}/{fname}") as fil: - got = cudf.read_orc(fil, columns=columns) + with pytest.warns(FutureWarning): + with fs.open_input_file(f"{bucket}/{fname}") as fil: + got = cudf.read_orc(fil, columns=columns) if columns: expect = expect[columns] diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index af912bee342..b0788bcc0fc 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,7 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations import datetime from decimal import Decimal +from typing import TYPE_CHECKING import cupy as cp import numpy as np @@ -11,6 +13,9 @@ import cudf +if TYPE_CHECKING: + from cudf._typing import DtypeObj + """Map numpy dtype to pyarrow types. Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special handling is required when converting a Boolean column into arrow. 
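A minimal sketch of the Boolean caveat above (plain NumPy/PyArrow, independent of the helpers in this module):

    import numpy as np
    import pyarrow as pa

    # NumPy stores np.bool_ in a full byte, while Arrow's boolean type is
    # bit-packed, so a naive bitwidth-based mapping would disagree here.
    assert np.dtype(np.bool_).itemsize * 8 == 8  # 8 bits per value
    assert pa.bool_().bit_width == 1  # 1 bit per value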
@@ -89,10 +94,6 @@ BOOL_TYPES = {"bool"} ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES -# The NumPy scalar types are a bit of a mess as they align with the C types -# so for now we use the `sctypes` dict (although it was made private in 2.0) -_NUMPY_SCTYPES = np.sctypes if hasattr(np, "sctypes") else np._core.sctypes - def np_to_pa_dtype(dtype): """Util to convert numpy dtype to PyArrow dtype.""" @@ -114,12 +115,6 @@ def np_to_pa_dtype(dtype): return _np_pa_dtypes[cudf.dtype(dtype).type] -def numeric_normalize_types(*args): - """Cast all args to a common type using numpy promotion logic""" - dtype = np.result_type(*[a.dtype for a in args]) - return [a.astype(dtype) for a in args] - - def _find_common_type_decimal(dtypes): # Find the largest scale and the largest difference between # precision and scale of the columns to be concatenated @@ -330,32 +325,28 @@ def can_convert_to_column(obj): return is_column_like(obj) or cudf.api.types.is_list_like(obj) -def min_scalar_type(a, min_size=8): - return min_signed_type(a, min_size=min_size) - - -def min_signed_type(x, min_size=8): +def min_signed_type(x: int, min_size: int = 8) -> np.dtype: """ Return the smallest *signed* integer dtype that can represent the integer ``x`` """ - for int_dtype in _NUMPY_SCTYPES["int"]: + for int_dtype in (np.int8, np.int16, np.int32, np.int64): if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max: - return int_dtype + return np.dtype(int_dtype) # resort to using `int64` and let numpy raise appropriate exception: return np.int64(x).dtype -def min_unsigned_type(x, min_size=8): +def min_unsigned_type(x: int, min_size: int = 8) -> np.dtype: """ Return the smallest *unsigned* integer dtype that can represent the integer ``x`` """ - for int_dtype in _NUMPY_SCTYPES["uint"]: + for int_dtype in (np.uint8, np.uint16, np.uint32, np.uint64): if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: if 0 <= x <= np.iinfo(int_dtype).max: - return int_dtype + return np.dtype(int_dtype) # resort to using `uint64` and let numpy raise appropriate exception: return np.uint64(x).dtype @@ -373,10 +364,10 @@ def min_column_type(x, expected_type): if x.null_count == len(x): return x.dtype - if np.issubdtype(x.dtype, np.floating): + if x.dtype.kind == "f": return get_min_float_dtype(x) - elif np.issubdtype(expected_type, np.integer): + elif cudf.dtype(expected_type).kind in "iu": max_bound_dtype = np.min_scalar_type(x.max()) min_bound_dtype = np.min_scalar_type(x.min()) result_type = np.promote_types(max_bound_dtype, min_bound_dtype) @@ -582,25 +573,18 @@ def _dtype_pandas_compatible(dtype): return dtype -def _maybe_convert_to_default_type(dtype): +def _maybe_convert_to_default_type(dtype: DtypeObj) -> DtypeObj: """Convert `dtype` to default if specified by user. If not specified, return as is. 
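    For example (an illustrative sketch; assumes the caller has set the
    option beforehand, following the ``dtype.kind`` dispatch below):

    >>> cudf.set_option("default_integer_bitwidth", 32)
    >>> _maybe_convert_to_default_type(cudf.dtype("int64"))
    dtype('int32')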
""" - if cudf.get_option("default_integer_bitwidth"): - if cudf.api.types.is_signed_integer_dtype(dtype): - return cudf.dtype( - f'i{cudf.get_option("default_integer_bitwidth")//8}' - ) - elif cudf.api.types.is_unsigned_integer_dtype(dtype): - return cudf.dtype( - f'u{cudf.get_option("default_integer_bitwidth")//8}' - ) - if cudf.get_option( - "default_float_bitwidth" - ) and cudf.api.types.is_float_dtype(dtype): - return cudf.dtype(f'f{cudf.get_option("default_float_bitwidth")//8}') - + if ib := cudf.get_option("default_integer_bitwidth"): + if dtype.kind == "i": + return cudf.dtype(f"i{ib//8}") + elif dtype.kind == "u": + return cudf.dtype(f"u{ib//8}") + if (fb := cudf.get_option("default_float_bitwidth")) and dtype.kind == "f": + return cudf.dtype(f"f{fb//8}") return dtype diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 76c7f2bfdb8..80555750b3a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -6,6 +6,7 @@ import warnings from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper from threading import Thread +from typing import Callable import fsspec import fsspec.implementations.local @@ -15,6 +16,7 @@ from pyarrow import PythonFile as ArrowPythonFile from pyarrow.lib import NativeFile +from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_LT_300 from cudf.utils.docutils import docfmt_partial @@ -24,7 +26,6 @@ except ImportError: fsspec_parquet = None - _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 _ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 @@ -86,7 +87,7 @@ 1 20 rapids 2 30 ai """.format(remote_data_sources=_docstring_remote_sources) -doc_read_avro = docfmt_partial(docstring=_docstring_read_avro) +doc_read_avro: Callable = docfmt_partial(docstring=_docstring_read_avro) _docstring_read_parquet_metadata = """ Read a Parquet file's metadata and schema @@ -174,15 +175,23 @@ columns are also loaded. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. Setting this argument to `False` - will require the entire file to be copied to host memory, and is highly - discouraged. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. open_file_options : dict, optional Dictionary of key-value pairs to pass to the function used to open remote files. By default, this will be `fsspec.parquet.open_parquet_file`. To deactivate optimized precaching, set the "method" to `None` under the "precache_options" key. Note that the `open_file_func` key can also be used to specify a custom file-open function. + + .. deprecated:: 24.08 + `open_file_options` is deprecated as it was intended for + pyarrow file inputs, which will no longer be accepted as + input/output cudf readers/writers in the future. bytes_per_thread : int, default None Determines the number of bytes to be allocated per thread to read the files in parallel. When there is a file of large size, we get slightly @@ -468,8 +477,12 @@ If True, use row index if available for faster seeking. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. 
This option is likely to improve - performance when making small reads from larger ORC files. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. storage_options : dict, optional, default None Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value @@ -934,7 +947,7 @@ -------- cudf.DataFrame.to_hdf : Write a HDF file from a DataFrame. """ -doc_read_hdf = docfmt_partial(docstring=_docstring_read_hdf) +doc_read_hdf: Callable = docfmt_partial(docstring=_docstring_read_hdf) _docstring_to_hdf = """ Write the contained data to an HDF5 file using HDFStore. @@ -1006,7 +1019,7 @@ cudf.DataFrame.to_parquet : Write a DataFrame to the binary parquet format. cudf.DataFrame.to_feather : Write out feather-format for DataFrames. """ -doc_to_hdf = docfmt_partial(docstring=_docstring_to_hdf) +doc_to_hdf: Callable = docfmt_partial(docstring=_docstring_to_hdf) _docstring_read_feather = """ Load a feather object from the file path, returning a DataFrame. @@ -1188,8 +1201,12 @@ the end of the range. use_python_file_object : boolean, default True If True, Arrow-backed PythonFile objects will be used in place of fsspec - AbstractBufferedFile objects at IO time. This option is likely to improve - performance when making small reads from larger CSV files. + AbstractBufferedFile objects at IO time. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers in the future. storage_options : dict, optional, default None Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value @@ -1409,7 +1426,7 @@ result : Series """ -doc_read_text = docfmt_partial(docstring=_docstring_text_datasource) +doc_read_text: Callable = docfmt_partial(docstring=_docstring_text_datasource) _docstring_get_reader_filepath_or_buffer = """ @@ -1430,9 +1447,19 @@ use_python_file_object : boolean, default False If True, Arrow-backed PythonFile objects will be used in place of fsspec AbstractBufferedFile objects. + + .. deprecated:: 24.08 + `use_python_file_object` is deprecated and will be removed in a future + version of cudf, as PyArrow NativeFiles will no longer be accepted as + input/output in cudf readers/writers. open_file_options : dict, optional Optional dictionary of keyword arguments to pass to `_open_remote_files` (used for remote storage only). + + .. deprecated:: 24.08 + `open_file_options` is deprecated as it was intended for + pyarrow file inputs, which will no longer be accepted as + input/output in cudf readers/writers in the future. allow_raw_text_input : boolean, default False If True, this indicates the input `path_or_data` could be a raw text input and will not check for its existence in the filesystem.
If False, @@ -1708,7 +1735,8 @@ def get_reader_filepath_or_buffer( mode="rb", fs=None, iotypes=(BytesIO, NativeFile), - use_python_file_object=False, + # no_default aliases to False + use_python_file_object=no_default, open_file_options=None, allow_raw_text_input=False, storage_options=None, @@ -1720,6 +1748,30 @@ def get_reader_filepath_or_buffer( path_or_data = stringify_pathlike(path_or_data) + if use_python_file_object is no_default: + use_python_file_object = False + elif use_python_file_object is not None: + warnings.warn( + "The 'use_python_file_object' keyword is deprecated and " + "will be removed in a future version.", + FutureWarning, + ) + else: + # Preserve the readers (e.g. read_csv) default of True + # if no use_python_file_object option is specified by the user + # for now (note: this is different from the default for this + # function of False) + # TODO: when non-pyarrow file reading perf is good enough + # we can default this to False + use_python_file_object = True + + if open_file_options is not None: + warnings.warn( + "The 'open_file_options' keyword is deprecated and " + "will be removed in a future version.", + FutureWarning, + ) + if isinstance(path_or_data, str): # Get a filesystem object if one isn't already available paths = [path_or_data] diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 7347ec7866a..c9b343e0f9f 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -6,6 +6,7 @@ import os import traceback import warnings +from contextlib import contextmanager import numpy as np import pandas as pd @@ -403,3 +404,28 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value): if result_mask is not None: result_col = result_col.set_mask(result_mask.as_mask()) return result_col + + +@contextmanager +def maybe_filter_deprecation( + condition: bool, message: str, category: type[Warning] +): + """Conditionally filter a warning category. + + Parameters + ---------- + condition + If true, filter the warning + message + Message to match, passed to :func:`warnings.filterwarnings` + category + Category of warning, passed to :func:`warnings.filterwarnings` + """ + with warnings.catch_warnings(): + if condition: + warnings.filterwarnings( + "ignore", + message, + category=category, + ) + yield diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index dcb33b1fc1a..b2ddb06d8c9 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ "ptxcompiler", "pyarrow>=16.1.0,<16.2.0a0", "rich", - "rmm==24.8.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -120,13 +120,14 @@ skip = [ [tool.rapids-build-backend] build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", "numpy==1.23.*", "pyarrow==16.1.0.*", - "rmm==24.8.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
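A minimal usage sketch of the `maybe_filter_deprecation` helper added in `python/cudf/cudf/utils/utils.py` above. The warning text matches the NativeFile deprecation message used later in this patch; the `user_passed_option` flag is illustrative only:

```python
import warnings

from cudf.utils.utils import maybe_filter_deprecation

# Only silence the deprecation when the caller did not opt in to
# pyarrow file objects themselves; otherwise let it propagate.
user_passed_option = False
with maybe_filter_deprecation(
    condition=not user_passed_option,
    message="Support for reading pyarrow's NativeFile is deprecated",
    category=FutureWarning,
):
    # A matching FutureWarning raised in this block is ignored; any
    # other warning is unaffected.
    warnings.warn(
        "Support for reading pyarrow's NativeFile is deprecated "
        "and will be removed in a future release of cudf.",
        FutureWarning,
    )
```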
[tool.scikit-build] diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index badfdf06d15..a9b60133f42 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] @@ -100,6 +100,7 @@ regex = "(?P.*)" [tool.rapids-build-backend] build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 764cdd3b3ca..f31193aa938 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -5,11 +5,15 @@ from __future__ import annotations +import os +import warnings from functools import partial from typing import TYPE_CHECKING import nvtx +from polars.exceptions import PerformanceWarning + from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: @@ -61,6 +65,12 @@ def execute_with_cudf( try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): nt.set_udf(partial(_callback, translate_ir(nt))) - except exception: + except exception as e: + if bool(int(os.environ.get("POLARS_VERBOSE", 0))): + warnings.warn( + f"Query execution with GPU not supported, reason: {type(e)}: {e}", + PerformanceWarning, + stacklevel=2, + ) if raise_on_fail: raise diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 42aba0fcdc0..02018548b2c 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -185,8 +185,7 @@ def nan_count(self) -> int: plc.reduce.reduce( plc.unary.is_nan(self.obj), plc.aggregation.sum(), - # TODO: pylibcudf needs to have a SizeType DataType singleton - plc.DataType(plc.TypeId.INT32), + plc.types.SIZE_TYPE, ) ).as_py() return 0 diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index cbeadf1426a..dba76855329 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -23,8 +23,6 @@ from typing_extensions import Self - import cudf - from cudf_polars.containers import Column @@ -83,16 +81,6 @@ def num_rows(self) -> int: """Number of rows.""" return 0 if len(self.columns) == 0 else self.table.num_rows() - @classmethod - def from_cudf(cls, df: cudf.DataFrame) -> Self: - """Create from a cudf dataframe.""" - return cls( - [ - NamedColumn(c.to_pylibcudf(mode="read"), name) - for name, c in df._data.items() - ] - ) - @classmethod def from_polars(cls, df: pl.DataFrame) -> Self: """ diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index f37cb3f475c..9e0fca3f52f 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -370,6 +370,10 @@ def do_evaluate( # datatype of pyarrow scalar is correct by construction. 
return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1))
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        return AggInfo([])
+
 class LiteralColumn(Expr):
     __slots__ = ("value",)
@@ -382,6 +386,13 @@ def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
         data = value.to_arrow()
         self.value = data.cast(dtypes.downcast_arrow_lists(data.type))
+    def get_hash(self) -> int:
+        """Compute a hash of the column."""
+        # This is stricter than necessary, but we only need this hash
+        # for identity in groupby replacements so it's OK. And this
+        # way we avoid doing potentially expensive compute.
+        return hash((type(self), self.dtype, id(self.value)))
+
     def do_evaluate(
         self,
         df: DataFrame,
@@ -393,6 +404,10 @@ def do_evaluate(
         # datatype of pyarrow array is correct by construction.
         return Column(plc.interop.from_arrow(self.value))
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        return AggInfo([])
+
 class Col(Expr):
     __slots__ = ("name",)
@@ -867,7 +882,14 @@ def __init__(
         self.name = name
         self.options = options
         self.children = children
-        if self.name not in ("mask_nans", "round", "setsorted", "unique"):
+        if self.name not in (
+            "mask_nans",
+            "round",
+            "setsorted",
+            "unique",
+            "dropnull",
+            "fill_null",
+        ):
             raise NotImplementedError(f"Unary function {name=}")
     def do_evaluate(
@@ -953,6 +975,27 @@ def do_evaluate(
                 order=order,
                 null_order=null_order,
             )
+        elif self.name == "dropnull":
+            (column,) = (
+                child.evaluate(df, context=context, mapping=mapping)
+                for child in self.children
+            )
+            return Column(
+                plc.stream_compaction.drop_nulls(
+                    plc.Table([column.obj]), [0], 1
+                ).columns()[0]
+            )
+        elif self.name == "fill_null":
+            column = self.children[0].evaluate(df, context=context, mapping=mapping)
+            if isinstance(self.children[1], Literal):
+                arg = plc.interop.from_arrow(self.children[1].value)
+            else:
+                evaluated = self.children[1].evaluate(
+                    df, context=context, mapping=mapping
+                )
+                arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj
+            return Column(plc.replace.replace_nulls(column.obj, arg))
+
         raise NotImplementedError(
             f"Unimplemented unary function {self.name=}"
         )  # pragma: no cover; init trips first
@@ -1145,6 +1188,14 @@ class Cast(Expr):
     def __init__(self, dtype: plc.DataType, value: Expr) -> None:
         super().__init__(dtype)
         self.children = (value,)
+        if not (
+            plc.traits.is_fixed_width(self.dtype)
+            and plc.traits.is_fixed_width(value.dtype)
+            and plc.unary.is_supported_cast(value.dtype, self.dtype)
+        ):
+            raise NotImplementedError(
+                f"Can't cast {value.dtype.id().name} to {self.dtype.id().name}"
+            )
     def do_evaluate(
         self,
@@ -1377,13 +1428,14 @@ def __init__(
         super().__init__(dtype)
         self.op = op
         self.children = (left, right)
-        if (
-            op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB)
-            and plc.traits.is_chrono(left.dtype)
-            and plc.traits.is_chrono(right.dtype)
-            and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id())
+        if not plc.binaryop.is_supported_operation(
+            self.dtype, left.dtype, right.dtype, op
         ):
-            raise NotImplementedError("Casting rules for timelike types")
+            raise NotImplementedError(
+                f"Operation {op.name} not supported "
+                f"for types {left.dtype.id().name} and {right.dtype.id().name} "
+                f"with output type {self.dtype.id().name}"
+            )
     _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = {
         pl_expr.Operator.Eq:
plc.binaryop.BinaryOperator.EQUAL, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index cce0c4a3d94..7f62dff4389 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -25,7 +25,6 @@ import polars as pl -import cudf import cudf._lib.pylibcudf as plc import cudf_polars.dsl.expr as expr @@ -205,12 +204,14 @@ class Scan(IR): def __post_init__(self) -> None: """Validate preconditions.""" - if self.file_options.n_rows is not None: - raise NotImplementedError("row limit in scan") - if self.typ not in ("csv", "parquet"): + if self.typ not in ("csv", "parquet", "ndjson"): # pragma: no cover + # This line is unhittable ATM since IPC/Anonymous scan raise + # on the polars side raise NotImplementedError(f"Unhandled scan type: {self.typ}") + if self.typ == "ndjson" and self.file_options.n_rows is not None: + raise NotImplementedError("row limit in scan") if self.cloud_options is not None and any( - self.cloud_options[k] is not None for k in ("aws", "azure", "gcp") + self.cloud_options.get(k) is not None for k in ("aws", "azure", "gcp") ): raise NotImplementedError( "Read from cloud storage" @@ -235,17 +236,21 @@ def __post_init__(self) -> None: # Need to do some file introspection to get the number # of columns so that column projection works right. raise NotImplementedError("Reading CSV without header") + elif self.typ == "ndjson": + # TODO: consider handling the low memory option here + # (maybe use chunked JSON reader) + if self.reader_options["ignore_errors"]: + raise NotImplementedError( + "ignore_errors is not supported in the JSON reader" + ) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" options = self.file_options with_columns = options.with_columns row_index = options.row_index + nrows = self.file_options.n_rows if self.file_options.n_rows is not None else -1 if self.typ == "csv": - dtype_map = { - name: cudf._lib.types.PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[typ.id()] - for name, typ in self.schema.items() - } parse_options = self.reader_options["parse_options"] sep = chr(parse_options["separator"]) quote = chr(parse_options["quote_char"]) @@ -280,35 +285,71 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: pieces = [] for p in self.paths: skiprows = self.reader_options["skip_rows"] - # TODO: read_csv expands globs which we should not do, - # because polars will already have handled them. 
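The removed TODO above records a subtlety of the new pylibcudf CSV path: polars expands globs before cudf-polars ever sees the paths, so the reader must treat each path literally. A sketch of the user-visible contract this preserves (hypothetical file names, mirroring the `test_scan_csv_multi` tests later in this patch):

```python
import polars as pl

# polars has already expanded "test*.csv" into concrete paths by the
# time Scan.evaluate runs, so re-globbing here would read files twice.
q_globbed = pl.scan_csv("test*.csv", glob=True)

# With glob=False the pattern is a literal file name and must be
# opened as-is rather than expanded.
q_literal = pl.scan_csv("test*.csv", glob=False)
```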
path = Path(p) with path.open() as f: while f.readline() == "\n": skiprows += 1 - pieces.append( - cudf.read_csv( - path, - sep=sep, - quotechar=quote, - lineterminator=eol, - names=column_names, - header=header, - usecols=usecols, - na_filter=True, - na_values=null_values, - keep_default_na=False, - skiprows=skiprows, - comment=comment, - decimal=decimal, - dtype=dtype_map, - ) + tbl_w_meta = plc.io.csv.read_csv( + plc.io.SourceInfo([path]), + delimiter=sep, + quotechar=quote, + lineterminator=eol, + col_names=column_names, + header=header, + usecols=usecols, + na_filter=True, + na_values=null_values, + keep_default_na=False, + skiprows=skiprows, + comment=comment, + decimal=decimal, + dtypes=self.schema, + nrows=nrows, + ) + pieces.append(tbl_w_meta) + tables, colnames = zip( + *( + (piece.tbl, piece.column_names(include_children=False)) + for piece in pieces ) - df = DataFrame.from_cudf(cudf.concat(pieces)) + ) + df = DataFrame.from_table( + plc.concatenate.concatenate(list(tables)), + colnames[0], + ) elif self.typ == "parquet": - cdf = cudf.read_parquet(self.paths, columns=with_columns) - assert isinstance(cdf, cudf.DataFrame) - df = DataFrame.from_cudf(cdf) + tbl_w_meta = plc.io.parquet.read_parquet( + plc.io.SourceInfo(self.paths), + columns=with_columns, + num_rows=nrows, + ) + df = DataFrame.from_table( + tbl_w_meta.tbl, + # TODO: consider nested column names? + tbl_w_meta.column_names(include_children=False), + ) + elif self.typ == "ndjson": + json_schema: list[tuple[str, str, list]] = [ + (name, typ, []) for name, typ in self.schema.items() + ] + plc_tbl_w_meta = plc.io.json.read_json( + plc.io.SourceInfo(self.paths), + lines=True, + dtypes=json_schema, + prune_columns=True, + ) + # TODO: I don't think cudf-polars supports nested types in general right now + # (but when it does, we should pass child column names from nested columns in) + df = DataFrame.from_table( + plc_tbl_w_meta.tbl, plc_tbl_w_meta.column_names(include_children=False) + ) + col_order = list(self.schema.keys()) + # TODO: remove condition when dropping support for polars 1.0 + # https://github.com/pola-rs/polars/pull/17363 + if row_index is not None and row_index[0] in self.schema: + col_order.remove(row_index[0]) + if col_order is not None: + df = df.select(col_order) else: raise NotImplementedError( f"Unhandled scan type: {self.typ}" @@ -335,13 +376,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: null_order=plc.types.NullOrder.AFTER, ) df = DataFrame([index, *df.columns]) - # TODO: should be true, but not the case until we get - # cudf-classic out of the loop for IO since it converts date32 - # to datetime. 
- # assert all( - # c.obj.type() == dtype - # for c, dtype in zip(df.columns, self.schema.values()) - # ) + assert all(c.obj.type() == self.schema[c.name] for c in df.columns) if self.predicate is None: return df else: @@ -514,7 +549,7 @@ def check_agg(agg: expr.Expr) -> int: return max(GroupBy.check_agg(child) for child in agg.children) elif isinstance(agg, expr.Agg): return 1 + max(GroupBy.check_agg(child) for child in agg.children) - elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)): + elif isinstance(agg, (expr.Len, expr.Col, expr.Literal, expr.LiteralColumn)): return 0 else: raise NotImplementedError(f"No handler for {agg=}") @@ -574,7 +609,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame([*result_keys, *results]).slice(self.options.slice) + return DataFrame(broadcast(*result_keys, *results)).slice(self.options.slice) @dataclasses.dataclass @@ -653,6 +688,59 @@ def _joiners( else: assert_never(how) + def _reorder_maps( + self, + left_rows: int, + lg: plc.Column, + left_policy: plc.copying.OutOfBoundsPolicy, + right_rows: int, + rg: plc.Column, + right_policy: plc.copying.OutOfBoundsPolicy, + ) -> list[plc.Column]: + """ + Reorder gather maps to satisfy polars join order restrictions. + + Parameters + ---------- + left_rows + Number of rows in left table + lg + Left gather map + left_policy + Nullify policy for left map + right_rows + Number of rows in right table + rg + Right gather map + right_policy + Nullify policy for right map + + Returns + ------- + list of reordered left and right gather maps. + + Notes + ----- + For a left join, the polars result preserves the order of the + left keys, and is stable wrt the right keys. For all other + joins, there is no order obligation. + """ + dt = plc.interop.to_arrow(plc.types.SIZE_TYPE) + init = plc.interop.from_arrow(pa.scalar(0, type=dt)) + step = plc.interop.from_arrow(pa.scalar(1, type=dt)) + left_order = plc.copying.gather( + plc.Table([plc.filling.sequence(left_rows, init, step)]), lg, left_policy + ) + right_order = plc.copying.gather( + plc.Table([plc.filling.sequence(right_rows, init, step)]), rg, right_policy + ) + return plc.sorting.stable_sort_by_key( + plc.Table([lg, rg]), + plc.Table([*left_order.columns(), *right_order.columns()]), + [plc.types.Order.ASCENDING, plc.types.Order.ASCENDING], + [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER], + ).columns() + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" left = self.left.evaluate(cache=cache) @@ -693,6 +781,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: result = DataFrame.from_table(table, left.column_names) else: lg, rg = join_fn(left_on.table, right_on.table, null_equality) + if how == "left": + # Order of left table is preserved + lg, rg = self._reorder_maps( + left.num_rows, lg, left_policy, right.num_rows, rg, right_policy + ) if coalesce and how == "inner": right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( @@ -1041,9 +1134,48 @@ class HConcat(IR): dfs: list[IR] """List of inputs.""" + @staticmethod + def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table: + """ + Extend a table with nulls. + + Parameters + ---------- + table + Table to extend + nrows + Number of additional rows + + Returns + ------- + New pylibcudf table. 
+ """ + return plc.concatenate.concatenate( + [ + table, + plc.Table( + [ + plc.Column.all_null_like(column, nrows) + for column in table.columns() + ] + ), + ] + ) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" dfs = [df.evaluate(cache=cache) for df in self.dfs] + max_rows = max(df.num_rows for df in dfs) + # Horizontal concatenation extends shorter tables with nulls + dfs = [ + df + if df.num_rows == max_rows + else DataFrame.from_table( + self._extend_with_nulls(df.table, nrows=max_rows - df.num_rows), + df.column_names, + ) + for df in dfs + ] return DataFrame( list(itertools.chain.from_iterable(df.columns for df in dfs)), ) diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index a9a4ae5f0a6..d37c96a15de 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -14,8 +14,6 @@ from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: - from collections.abc import Mapping - import polars as pl from cudf_polars.typing import OptimizationArgs @@ -26,7 +24,9 @@ def assert_gpu_result_equal( lazydf: pl.LazyFrame, *, - collect_kwargs: Mapping[OptimizationArgs, bool] | None = None, + collect_kwargs: dict[OptimizationArgs, bool] | None = None, + polars_collect_kwargs: dict[OptimizationArgs, bool] | None = None, + cudf_collect_kwargs: dict[OptimizationArgs, bool] | None = None, check_row_order: bool = True, check_column_order: bool = True, check_dtypes: bool = True, @@ -43,8 +43,17 @@ def assert_gpu_result_equal( lazydf frame to collect. collect_kwargs - Keyword arguments to pass to collect. Useful for controlling - optimization settings. + Common keyword arguments to pass to collect for both polars CPU and + cudf-polars. + Useful for controlling optimization settings. + polars_collect_kwargs + Keyword arguments to pass to collect for execution on polars CPU. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. + cudf_collect_kwargs + Keyword arguments to pass to collect for execution on cudf-polars. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. check_row_order Expect rows to be in same order check_column_order @@ -68,10 +77,19 @@ def assert_gpu_result_equal( NotImplementedError If GPU collection failed in some way. 
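A sketch of how the split collect kwargs above are intended to be used, assuming a trivial query; the `projection_pushdown` override mirrors the polars CPU workaround applied in `test_scan.py` later in this patch:

```python
import polars as pl

from cudf_polars.testing.asserts import assert_gpu_result_equal

q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a") * 2)

# Both engines collect with the common kwargs; the polars CPU baseline
# additionally runs with projection pushdown disabled.
assert_gpu_result_equal(
    q,
    collect_kwargs={"predicate_pushdown": True},
    polars_collect_kwargs={"projection_pushdown": False},
)
```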
""" - collect_kwargs = {} if collect_kwargs is None else collect_kwargs - expect = lazydf.collect(**collect_kwargs) + if collect_kwargs is None: + collect_kwargs = {} + final_polars_collect_kwargs = collect_kwargs.copy() + final_cudf_collect_kwargs = collect_kwargs.copy() + if polars_collect_kwargs is not None: + final_polars_collect_kwargs.update(polars_collect_kwargs) + if cudf_collect_kwargs is not None: # pragma: no cover + # exclude from coverage since not used ATM + # but this is probably still useful + final_cudf_collect_kwargs.update(cudf_collect_kwargs) + expect = lazydf.collect(**final_polars_collect_kwargs) got = lazydf.collect( - **collect_kwargs, + **final_cudf_collect_kwargs, post_opt_callback=partial(execute_with_cudf, raise_on_fail=True), ) assert_frame_equal( diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 918cd024fa2..cd68d021286 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -14,43 +14,7 @@ import cudf._lib.pylibcudf as plc -__all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"] - - -def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId): - """ - Do two datetime typeids have matching resolution for a binop. - - Parameters - ---------- - lid - Left type id - rid - Right type id - - Returns - ------- - True if resolutions are compatible, False otherwise. - - Notes - ----- - Polars has different casting rules for combining - datetimes/durations than libcudf, and while we don't encode the - casting rules fully, just reject things we can't handle. - - Precondition for correctness: both lid and rid are timelike. - """ - if lid == rid: - return True - # Timestamps are smaller than durations in the libcudf enum. - lid, rid = sorted([lid, rid]) - if lid == plc.TypeId.TIMESTAMP_MILLISECONDS: - return rid == plc.TypeId.DURATION_MILLISECONDS - elif lid == plc.TypeId.TIMESTAMP_MICROSECONDS: - return rid == plc.TypeId.DURATION_MICROSECONDS - elif lid == plc.TypeId.TIMESTAMP_NANOSECONDS: - return rid == plc.TypeId.DURATION_NANOSECONDS - return False +__all__ = ["from_polars", "downcast_arrow_lists"] def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: @@ -153,7 +117,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType: # TODO: Hopefully return plc.DataType(plc.TypeId.EMPTY) elif isinstance(dtype, pl.List): - # TODO: This doesn't consider the value type. 
+ # Recurse to catch unsupported inner types + _ = from_polars(dtype.inner) return plc.DataType(plc.TypeId.LIST) else: raise NotImplementedError(f"{dtype=} conversion not supported") diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index a9ac14c25aa..9807cffb384 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -15,6 +15,7 @@ POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0") POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1") POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2") +POLARS_VERSION_GE_121 = POLARS_VERSION >= parse("1.2.1") POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0") POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1") POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2") diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index 874bb849747..6cd36136bf8 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -8,7 +8,7 @@ You will need: preferred configuration. Or else, use [rustup](https://www.rust-lang.org/tools/install) 2. A [cudf development - environment](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment). + environment](https://github.com/rapidsai/cudf/blob/branch-24.10/CONTRIBUTING.md#setting-up-your-build-environment). The combined devcontainer works, or whatever your favourite approach is. > ![NOTE] These instructions will get simpler as we merge code in. diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 0b559f7a8e9..f8a1973bdbf 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", "polars>=1.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -182,3 +182,4 @@ docstring-code-format = true [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" diff --git a/python/cudf_polars/tests/expressions/test_casting.py b/python/cudf_polars/tests/expressions/test_casting.py new file mode 100644 index 00000000000..3e003054338 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_casting.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) + +_supported_dtypes = [(pl.Int8(), pl.Int64())] + +_unsupported_dtypes = [ + (pl.String(), pl.Int64()), +] + + +@pytest.fixture +def dtypes(request): + return request.param + + +@pytest.fixture +def tests(dtypes): + fromtype, totype = dtypes + if fromtype == pl.String(): + data = ["a", "b", "c"] + else: + data = [1, 2, 3] + return pl.DataFrame( + { + "a": pl.Series(data, dtype=fromtype), + } + ).lazy(), totype + + +@pytest.mark.parametrize("dtypes", _supported_dtypes, indirect=True) +def test_cast_supported(tests): + df, totype = tests + q = df.select(pl.col("a").cast(totype)) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("dtypes", _unsupported_dtypes, indirect=True) +def test_cast_unsupported(tests): + df, totype = tests + assert_ir_translation_raises( + df.select(pl.col("a").cast(totype)), NotImplementedError + ) diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py index 55e688428bd..5bd3131d1d7 100644 --- a/python/cudf_polars/tests/expressions/test_literal.py +++ b/python/cudf_polars/tests/expressions/test_literal.py @@ -6,6 +6,8 @@ import polars as pl +import cudf._lib.pylibcudf as plc + from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, @@ -64,11 +66,17 @@ def test_timelike_literal(timestamp, timedelta): adjusted=timestamp + timedelta, two_delta=timedelta + timedelta, ) - schema = q.collect_schema() - time_type = schema["time"] - delta_type = schema["delta"] - if dtypes.have_compatible_resolution( - dtypes.from_polars(time_type).id(), dtypes.from_polars(delta_type).id() + schema = {k: dtypes.from_polars(v) for k, v in q.collect_schema().items()} + if plc.binaryop.is_supported_operation( + schema["adjusted"], + schema["time"], + schema["delta"], + plc.binaryop.BinaryOperator.ADD, + ) and plc.binaryop.is_supported_operation( + schema["two_delta"], + schema["delta"], + schema["delta"], + plc.binaryop.BinaryOperator.ADD, ): assert_gpu_result_equal(q) else: diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py index b6bcd0026fa..8f68bbc460c 100644 --- a/python/cudf_polars/tests/expressions/test_numeric_binops.py +++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py @@ -6,7 +6,10 @@ import polars as pl -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) dtypes = [ pl.Int8, @@ -111,3 +114,12 @@ def test_binop_with_scalar(left_scalar, right_scalar): q = df.select(lop / rop) assert_gpu_result_equal(q) + + +def test_numeric_to_string_cast_fails(): + df = pl.DataFrame( + {"a": [1, 1, 2, 3, 3, 4, 1], "b": [None, 2, 3, 4, 5, 6, 7]} + ).lazy() + q = df.select(pl.col("a").cast(pl.String)) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index 8cf65dd51ac..df08e15baa4 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -34,7 +34,9 @@ def ldf(with_nulls): if with_nulls: a[4] = None a[-3] = None - return pl.LazyFrame({"a": 
a, "b": range(len(a))}) + return pl.LazyFrame( + {"a": a, "b": range(len(a)), "c": [str(i) for i in range(len(a))]} + ) slice_cases = [ @@ -84,7 +86,7 @@ def test_contains_re_non_strict_raises(ldf): def test_contains_re_non_literal_raises(ldf): - q = ldf.select(pl.col("a").str.contains(pl.col("b"), literal=False)) + q = ldf.select(pl.col("a").str.contains(pl.col("c"), literal=False)) assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py new file mode 100644 index 00000000000..5b4bba55552 --- /dev/null +++ b/python/cudf_polars/tests/test_config.py @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.dsl.ir import IR +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) + + +def test_polars_verbose_warns(monkeypatch): + def raise_unimplemented(self): + raise NotImplementedError("We don't support this") + + monkeypatch.setattr(IR, "__post_init__", raise_unimplemented) + q = pl.LazyFrame({}) + # Ensure that things raise + assert_ir_translation_raises(q, NotImplementedError) + with ( + pl.Config(verbose=True), + pytest.raises(pl.exceptions.ComputeError), + pytest.warns( + pl.exceptions.PerformanceWarning, + match="Query execution with GPU not supported", + ), + ): + # And ensure that collecting issues the correct warning. + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_drop_nulls.py b/python/cudf_polars/tests/test_drop_nulls.py new file mode 100644 index 00000000000..5dfe9f66a97 --- /dev/null +++ b/python/cudf_polars/tests/test_drop_nulls.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) + + +@pytest.fixture( + params=[ + [1, 2, 1, 3, 5, None, None], + [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + [], + [None, None], + [1, 2, 3, 4, 5], + ] +) +def null_data(request): + is_empty = pl.Series(request.param).dtype == pl.Null + return pl.DataFrame( + { + "a": pl.Series(request.param, dtype=pl.Float64 if is_empty else None), + "b": pl.Series(request.param, dtype=pl.Float64 if is_empty else None), + } + ).lazy() + + +def test_drop_null(null_data): + q = null_data.select(pl.col("a").drop_nulls()) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "value", + [0, pl.col("a").mean(), pl.col("b")], + ids=["scalar", "aggregation", "column_expression"], +) +def test_fill_null(null_data, value): + q = null_data.select(pl.col("a").fill_null(value)) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "strategy", ["forward", "backward", "min", "max", "mean", "zero", "one"] +) +def test_fill_null_with_strategy(null_data, strategy): + q = null_data.select(pl.col("a").fill_null(strategy=strategy)) + + # Not yet exposed to python from rust + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("strategy", ["forward", "backward"]) +@pytest.mark.parametrize("limit", [0, 1, 2]) +def test_fill_null_with_limit(null_data, strategy, limit): + q = null_data.select(pl.col("a").fill_null(strategy=strategy, limit=limit)) + + # Not yet exposed to python from rust + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index b07d8e38217..a75825ef3d3 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -155,3 +155,31 @@ def test_groupby_nan_minmax_raises(op): q = df.group_by("key").agg(op(pl.col("value"))) assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize( + "key", + [ + pytest.param( + 1, + marks=pytest.mark.xfail( + versions.POLARS_VERSION_GE_121, reason="polars 1.2.1 disallows this" + ), + ), + pl.col("key1"), + ], +) +@pytest.mark.parametrize( + "expr", + [ + pl.lit(1).alias("value"), + pl.lit([[4, 5, 6]]).alias("value"), + pl.col("float") * (1 - pl.col("int")), + [pl.lit(2).alias("value"), pl.col("float") * 2], + ], +) +def test_groupby_literal_in_agg(df, key, expr): + # check_row_order=False doesn't work for list aggregations + # so just sort by the group key + q = df.group_by(key).agg(expr).sort(key, maintain_order=True) + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_hconcat.py b/python/cudf_polars/tests/test_hconcat.py index 46cbb21b25a..4737aa18028 100644 --- a/python/cudf_polars/tests/test_hconcat.py +++ b/python/cudf_polars/tests/test_hconcat.py @@ -17,3 +17,12 @@ def test_hconcat(): ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c")) query = pl.concat([ldf, ldf2], how="horizontal") assert_gpu_result_equal(query) + + +def test_hconcat_different_heights(): + left = pl.LazyFrame({"a": [1, 2, 3, 4]}) + + right = pl.LazyFrame({"b": [[1], [2]], "c": ["a", "bcde"]}) + + q = pl.concat([left, right], how="horizontal") + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 89f6fd3455b..1e880cdc6de 100644 --- a/python/cudf_polars/tests/test_join.py +++ 
b/python/cudf_polars/tests/test_join.py @@ -12,65 +12,68 @@ ) +@pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"]) +def join_nulls(request): + return request.param + + +@pytest.fixture(params=["inner", "left", "semi", "anti", "full"]) +def how(request): + return request.param + + +@pytest.fixture +def left(): + return pl.LazyFrame( + { + "a": [1, 2, 3, 1, None], + "b": [1, 2, 3, 4, 5], + "c": [2, 3, 4, 5, 6], + } + ) + + +@pytest.fixture +def right(): + return pl.LazyFrame( + { + "a": [1, 4, 3, 7, None, None], + "c": [2, 3, 4, 5, 6, 7], + } + ) + + @pytest.mark.parametrize( - "how", + "join_expr", [ - "inner", - "left", - "semi", - "anti", - "full", + pl.col("a"), + pl.col("a") * 2, + [pl.col("a"), pl.col("c") + 1], + ["c", "a"], ], ) -@pytest.mark.parametrize("coalesce", [False, True]) -@pytest.mark.parametrize( - "join_nulls", [False, True], ids=["nulls_not_equal", "nulls_equal"] -) +def test_non_coalesce_join(left, right, how, join_nulls, join_expr): + query = left.join( + right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=False + ) + assert_gpu_result_equal(query, check_row_order=how == "left") + + @pytest.mark.parametrize( "join_expr", [ pl.col("a"), - pl.col("a") * 2, - [pl.col("a"), pl.col("c") + 1], ["c", "a"], ], ) -def test_join(how, coalesce, join_nulls, join_expr): - left = pl.DataFrame( - { - "a": [1, 2, 3, 1, None], - "b": [1, 2, 3, 4, 5], - "c": [2, 3, 4, 5, 6], - } - ).lazy() - right = pl.DataFrame( - { - "a": [1, 4, 3, 7, None, None], - "c": [2, 3, 4, 5, 6, 7], - } - ).lazy() - +def test_coalesce_join(left, right, how, join_nulls, join_expr): query = left.join( - right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce + right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=True ) assert_gpu_result_equal(query, check_row_order=False) -def test_cross_join(): - left = pl.DataFrame( - { - "a": [1, 2, 3, 1, None], - "b": [1, 2, 3, 4, 5], - "c": [2, 3, 4, 5, 6], - } - ).lazy() - right = pl.DataFrame( - { - "a": [1, 4, 3, 7, None, None], - "c": [2, 3, 4, 5, 6, 7], - } - ).lazy() - +def test_cross_join(left, right): q = left.join(right, how="cross") assert_gpu_result_equal(q) @@ -79,9 +82,7 @@ def test_cross_join(): @pytest.mark.parametrize( "left_on,right_on", [(pl.col("a"), pl.lit(2)), (pl.lit(2), pl.col("a"))] ) -def test_join_literal_key_unsupported(left_on, right_on): - left = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) - right = pl.LazyFrame({"a": [1, 2, 3], "b": [5, 6, 7]}) +def test_join_literal_key_unsupported(left, right, left_on, right_on): q = left.join(right, left_on=left_on, right_on=right_on, how="inner") assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index d0c41090433..64acbb076ed 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -2,6 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import os + import pytest import polars as pl @@ -22,48 +24,23 @@ def row_index(request): @pytest.fixture( - params=[ - None, - pytest.param( - 2, marks=pytest.mark.xfail(reason="No handling of row limit in scan") - ), - pytest.param( - 3, marks=pytest.mark.xfail(reason="No handling of row limit in scan") - ), - ], + params=[None, 2, 3], ids=["all-rows", "n_rows-with-skip", "n_rows-no-skip"], ) def n_rows(request): return request.param -@pytest.fixture(params=["csv", "parquet"]) -def df(request, tmp_path, row_index, n_rows): - df = pl.DataFrame( 
+@pytest.fixture(scope="module") +def df(): + # TODO: more dtypes + return pl.DataFrame( { - "a": [1, 2, 3, None], - "b": ["ẅ", "x", "y", "z"], - "c": [None, None, 4, 5], + "a": [1, 2, 3, None, 4, 5], + "b": ["ẅ", "x", "y", "z", "123", "abcd"], + "c": [None, None, 4, 5, -1, 0], } ) - name, offset = row_index - if request.param == "csv": - df.write_csv(tmp_path / "file.csv") - return pl.scan_csv( - tmp_path / "file.csv", - row_index_name=name, - row_index_offset=offset, - n_rows=n_rows, - ) - else: - df.write_parquet(tmp_path / "file.pq") - # parquet doesn't have skip_rows argument - return pl.scan_parquet( - tmp_path / "file.pq", - row_index_name=name, - row_index_offset=offset, - n_rows=n_rows, - ) @pytest.fixture(params=[None, ["a"], ["b", "a"]], ids=["all", "subset", "reordered"]) @@ -81,20 +58,72 @@ def mask(request): return request.param -def test_scan(df, columns, mask): - q = df +def make_source(df, path, format): + """ + Writes the passed polars df to a file of + the desired format + """ + if format == "csv": + df.write_csv(path) + elif format == "ndjson": + df.write_ndjson(path) + else: + df.write_parquet(path) + + +@pytest.mark.parametrize( + "format, scan_fn", + [ + ("csv", pl.scan_csv), + ("ndjson", pl.scan_ndjson), + ("parquet", pl.scan_parquet), + ], +) +def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, request): + name, offset = row_index + make_source(df, tmp_path / "file", format) + request.applymarker( + pytest.mark.xfail( + condition=(n_rows is not None and scan_fn is pl.scan_ndjson), + reason="libcudf does not support n_rows", + ) + ) + q = scan_fn( + tmp_path / "file", + row_index_name=name, + row_index_offset=offset, + n_rows=n_rows, + ) if mask is not None: q = q.filter(mask) if columns is not None: - q = df.select(*columns) - assert_gpu_result_equal(q) + q = q.select(*columns) + polars_collect_kwargs = {} + if versions.POLARS_VERSION_LT_12: + # https://github.com/pola-rs/polars/issues/17553 + polars_collect_kwargs = {"projection_pushdown": False} + assert_gpu_result_equal( + q, + polars_collect_kwargs=polars_collect_kwargs, + # This doesn't work in polars < 1.2 since the row-index + # is in the wrong order in previous polars releases + check_column_order=versions.POLARS_VERSION_LT_12, + ) def test_scan_unsupported_raises(tmp_path): df = pl.DataFrame({"a": [1, 2, 3]}) - df.write_ndjson(tmp_path / "df.json") - q = pl.scan_ndjson(tmp_path / "df.json") + df.write_ipc(tmp_path / "df.ipc") + q = pl.scan_ipc(tmp_path / "df.ipc") + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_ndjson_nrows_notimplemented(tmp_path, df): + df = pl.DataFrame({"a": [1, 2, 3]}) + + df.write_ndjson(tmp_path / "df.jsonl") + q = pl.scan_ndjson(tmp_path / "df.jsonl", n_rows=1) assert_ir_translation_raises(q, NotImplementedError) @@ -129,6 +158,42 @@ def test_scan_csv_column_renames_projection_schema(tmp_path): assert_gpu_result_equal(q) +@pytest.mark.parametrize( + "filename,glob", + [ + (["test1.csv", "test2.csv"], True), + ("test*.csv", True), + # Make sure we don't expand glob when + # trying to read a file like test*.csv + # when glob=False + ("test*.csv", False), + ], +) +def test_scan_csv_multi(tmp_path, filename, glob): + with (tmp_path / "test1.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2\n3,4,5""") + with (tmp_path / "test2.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2\n3,4,5""") + with (tmp_path / "test*.csv").open("w") as f: + f.write("""foo,bar,baz\n1,2\n3,4,5""") + os.chdir(tmp_path) + q = pl.scan_csv(filename, 
glob=glob)
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_multi_differing_colnames(tmp_path):
+    with (tmp_path / "test1.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+    with (tmp_path / "test2.csv").open("w") as f:
+        f.write("""abc,def,ghi\n1,2\n3,4,5""")
+    q = pl.scan_csv(
+        [tmp_path / "test1.csv", tmp_path / "test2.csv"],
+    )
+    with pytest.raises(pl.exceptions.ComputeError):
+        q.explain()
+
+
 def test_scan_csv_skip_after_header_not_implemented(tmp_path):
     with (tmp_path / "test.csv").open("w") as f:
         f.write("""foo,bar,baz\n1,2,3\n3,4,5""")
@@ -195,3 +260,23 @@ def test_scan_csv_skip_initial_empty_rows(tmp_path):
     q = pl.scan_csv(tmp_path / "test.csv", separator="|", skip_rows=1)
     assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "schema",
+    [
+        # List of colnames (basically like the names param in CSV)
+        {"b": pl.String, "a": pl.Float32},
+        {"a": pl.UInt64},
+    ],
+)
+def test_scan_ndjson_schema(df, tmp_path, schema):
+    make_source(df, tmp_path / "file", "ndjson")
+    q = pl.scan_ndjson(tmp_path / "file", schema=schema)
+    assert_gpu_result_equal(q)
+
+
+def test_scan_ndjson_unsupported(df, tmp_path):
+    make_source(df, tmp_path / "file", "ndjson")
+    q = pl.scan_ndjson(tmp_path / "file", ignore_errors=True)
+    assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/cudf_polars/tests/utils/test_dtypes.py b/python/cudf_polars/tests/utils/test_dtypes.py
index 535fdd846a0..bbdb4faa256 100644
--- a/python/cudf_polars/tests/utils/test_dtypes.py
+++ b/python/cudf_polars/tests/utils/test_dtypes.py
@@ -16,6 +16,7 @@
         pl.Time(),
         pl.Struct({"a": pl.Int8, "b": pl.Float32}),
         pl.Datetime("ms", time_zone="US/Pacific"),
+        pl.List(pl.Datetime("ms", time_zone="US/Pacific")),
         pl.Array(pl.Int8, 2),
         pl.Binary(),
         pl.Categorical(),
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index 7b99e041b54..d6b88167262 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -20,8 +20,8 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "confluent-kafka>=1.9.0,<1.10.0a0",
-    "cudf==24.8.*,>=0.0.0a0",
-    "cudf_kafka==24.8.*,>=0.0.0a0",
+    "cudf==24.10.*,>=0.0.0a0",
+    "cudf_kafka==24.10.*,>=0.0.0a0",
     "streamz",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
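For reference, a minimal sketch of the ndjson scan behavior exercised by the tests above (hypothetical file path): an explicit schema both subsets and types the columns, while unsupported reader options fail translation:

```python
import polars as pl

# The schema reorders and subsets columns, as in test_scan_ndjson_schema;
# prune_columns=True on the libcudf side drops everything else.
q = pl.scan_ndjson("file.jsonl", schema={"b": pl.String, "a": pl.Float32})

# ignore_errors=True is rejected by the cudf-polars JSON reader and the
# query fails translation with NotImplementedError.
q_bad = pl.scan_ndjson("file.jsonl", ignore_errors=True)
```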
classifiers = [ @@ -49,6 +49,7 @@ Homepage = "https://github.com/rapidsai/cudf" [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" [tool.setuptools] license-files = ["LICENSE"] @@ -57,7 +58,7 @@ zip-safe = false [tool.setuptools.dynamic] version = {file = "custreamz/VERSION"} -[tools.setuptools.packages.find] +[tool.setuptools.packages.find] include = [ "custreamz", "custreamz.*", diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 810a804e428..f0cab953458 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -33,6 +33,7 @@ _is_local_filesystem, _open_remote_files, ) +from cudf.utils.utils import maybe_filter_deprecation class CudfEngine(ArrowDatasetEngine): @@ -110,39 +111,50 @@ def _read_paths( ), ) - # Use cudf to read in data - try: - df = cudf.read_parquet( - paths_or_fobs, - engine="cudf", - columns=columns, - row_groups=row_groups if row_groups else None, - dataset_kwargs=dataset_kwargs, - categorical_partitions=False, - **kwargs, - ) - except RuntimeError as err: - # TODO: Remove try/except after null-schema issue is resolved - # (See: https://github.com/rapidsai/cudf/issues/12702) - if len(paths_or_fobs) > 1: - df = cudf.concat( - [ - cudf.read_parquet( - pof, - engine="cudf", - columns=columns, - row_groups=row_groups[i] - if row_groups - else None, - dataset_kwargs=dataset_kwargs, - categorical_partitions=False, - **kwargs, - ) - for i, pof in enumerate(paths_or_fobs) - ] + # Filter out deprecation warning unless the user + # specifies open_file_options and/or use_python_file_object. + # Otherwise, the FutureWarning is out of their control. 
+            # Filter out deprecation warning unless the user
+            # specifies open_file_options and/or use_python_file_object.
+            # Otherwise, the FutureWarning is out of their control.
+            with maybe_filter_deprecation(
+                (
+                    not open_file_options
+                    and "use_python_file_object" not in kwargs
+                ),
+                message="Support for reading pyarrow's NativeFile is deprecated",
+                category=FutureWarning,
+            ):
+                # Use cudf to read in data
+                try:
+                    df = cudf.read_parquet(
+                        paths_or_fobs,
+                        engine="cudf",
+                        columns=columns,
+                        row_groups=row_groups if row_groups else None,
+                        dataset_kwargs=dataset_kwargs,
+                        categorical_partitions=False,
+                        **kwargs,
                     )
-            else:
-                raise err
+                except RuntimeError as err:
+                    # TODO: Remove try/except after null-schema issue is resolved
+                    # (See: https://github.com/rapidsai/cudf/issues/12702)
+                    if len(paths_or_fobs) > 1:
+                        df = cudf.concat(
+                            [
+                                cudf.read_parquet(
+                                    pof,
+                                    engine="cudf",
+                                    columns=columns,
+                                    row_groups=row_groups[i]
+                                    if row_groups
+                                    else None,
+                                    dataset_kwargs=dataset_kwargs,
+                                    categorical_partitions=False,
+                                    **kwargs,
+                                )
+                                for i, pof in enumerate(paths_or_fobs)
+                            ]
+                        )
+                    else:
+                        raise err
         # Apply filters (if any are defined)
         df = _apply_post_filters(df, filters)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index a67404da4fe..ac3245b3748 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -9,6 +9,8 @@
 import pyarrow.fs as pa_fs
 import pytest
+from dask.dataframe import assert_eq
+
 import dask_cudf
 moto = pytest.importorskip("moto", minversion="3.1.6")
@@ -102,6 +104,11 @@ def s3_context(s3_base, bucket, files=None):
         pass
+@pytest.fixture(scope="module")
+def pdf():
+    return pd.DataFrame({"a": [1, 2, 3, 4], "b": [2.1, 2.2, 2.3, 2.4]})
+
+
 def test_read_csv(s3_base, s3so):
     with s3_context(
         s3_base=s3_base, bucket="daskcsv", files={"a.csv": b"a,b\n1,2\n3,4\n"}
@@ -112,6 +119,22 @@
         assert df.a.sum().compute() == 4
+def test_read_csv_warns(s3_base, s3so):
+    with s3_context(
+        s3_base=s3_base,
+        bucket="daskcsv_warns",
+        files={"a.csv": b"a,b\n1,2\n3,4\n"},
+    ):
+        with pytest.warns(FutureWarning):
+            df = dask_cudf.read_csv(
+                "s3://daskcsv_warns/*.csv",
+                blocksize="50 B",
+                storage_options=s3so,
+                use_python_file_object=True,
+            )
+        assert df.a.sum().compute() == 4
+
+
 @pytest.mark.parametrize(
     "open_file_options",
     [
@@ -120,8 +143,7 @@
         {"open_file_func": None},
     ],
 )
-def test_read_parquet(s3_base, s3so, open_file_options):
-    pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2.1, 2.2, 2.3, 2.4]})
+def test_read_parquet_open_file_options(s3_base, s3so, open_file_options, pdf):
     buffer = BytesIO()
     pdf.to_parquet(path=buffer)
     buffer.seek(0)
@@ -138,5 +160,67 @@
                 storage_options=s3so,
                 open_file_options=open_file_options,
             )
-            assert df.a.sum().compute() == 10
-            assert df.b.sum().compute() == 9
+            with pytest.warns(FutureWarning):
+                assert df.a.sum().compute() == 10
+            with pytest.warns(FutureWarning):
+                assert df.b.sum().compute() == 9
+
+
+def test_read_parquet(s3_base, s3so, pdf):
+    fname = "test_parquet_reader_dask.parquet"
+    bucket = "parquet"
+    buffer = BytesIO()
+    pdf.to_parquet(path=buffer)
+    buffer.seek(0)
+    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+        got = dask_cudf.read_parquet(
+            f"s3://{bucket}/{fname}",
+            storage_options=s3so,
+        )
+        assert_eq(pdf, got)
+
+
+def test_read_parquet_use_python_file_object(s3_base, s3so, pdf):
+    fname = "test_parquet_use_python_file_object.parquet"
+    bucket = "parquet"
+    buffer = BytesIO()
+    pdf.to_parquet(path=buffer)
+    buffer.seek(0)
+
with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + with pytest.warns(FutureWarning): + got = dask_cudf.read_parquet( + f"s3://{bucket}/{fname}", + storage_options=s3so, + read={"use_python_file_object": True}, + ).head() + assert_eq(pdf, got) + + +def test_read_orc(s3_base, s3so, pdf): + fname = "test_orc_reader_dask.orc" + bucket = "orc" + buffer = BytesIO() + pdf.to_orc(path=buffer) + buffer.seek(0) + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + got = dask_cudf.read_orc( + f"s3://{bucket}/{fname}", + storage_options=s3so, + ) + assert_eq(pdf, got) + + +def test_read_orc_use_python_file_object(s3_base, s3so, pdf): + fname = "test_orc_use_python_file_object.orc" + bucket = "orc" + buffer = BytesIO() + pdf.to_orc(path=buffer) + buffer.seek(0) + with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): + with pytest.warns(FutureWarning): + got = dask_cudf.read_orc( + f"s3://{bucket}/{fname}", + storage_options=s3so, + use_python_file_object=True, + ).head() + assert_eq(pdf, got) diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 9b2e3a5a7b1..872ecd35c28 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -19,12 +19,12 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==24.8.*,>=0.0.0a0", + "cudf==24.10.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<2.0a0", "pandas>=2.0,<2.2.3dev0", - "rapids-dask-dependency==24.8.*,>=0.0.0a0", + "rapids-dask-dependency==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -45,7 +45,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" [project.optional-dependencies] test = [ - "dask-cuda==24.8.*,>=0.0.0a0", + "dask-cuda==24.10.*,>=0.0.0a0", "numba>=0.57", "pytest-cov", "pytest-xdist", @@ -58,6 +58,7 @@ Homepage = "https://github.com/rapidsai/cudf" [tool.rapids-build-backend] build-backend = "setuptools.build_meta" dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" [tool.setuptools] license-files = ["LICENSE"] diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt index 021f8f593b3..fa52bc0780a 100644 --- a/python/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/CMakeLists.txt @@ -16,10 +16,10 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) include(../../rapids_config.cmake) include(rapids-cuda) -rapids_cuda_init_architectures(cudf-python) +rapids_cuda_init_architectures(pylibcudf) project( - cudf-python + pylibcudf VERSION "${RAPIDS_VERSION}" LANGUAGES CXX CUDA ) @@ -96,5 +96,5 @@ include(cmake/Modules/LinkPyarrowHeaders.cmake) add_subdirectory(pylibcudf) if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET pylibcudf PATHS "${cython_lib_dir}") + rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}") endif() diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index 0800fa18e94..df4591baa71 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -20,6 +20,7 @@ set(cython_sources concatenate.pyx copying.pyx datetime.pyx + experimental.pyx expressions.pyx filling.pyx gpumemoryview.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index 
26e89b818d3..71f523fc3cd 100644
--- a/python/pylibcudf/pylibcudf/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/__init__.pxd
@@ -8,6 +8,7 @@ from . cimport (
     concatenate,
     copying,
     datetime,
+    experimental,
     expressions,
     filling,
     groupby,
@@ -48,6 +49,8 @@ __all__ = [
     "concatenate",
     "copying",
     "datetime",
+    "experimental",
+    "expressions",
     "filling",
     "gpumemoryview",
     "groupby",
diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py
index e89a5ed9f96..9705eba84b1 100644
--- a/python/pylibcudf/pylibcudf/__init__.py
+++ b/python/pylibcudf/pylibcudf/__init__.py
@@ -7,6 +7,7 @@
     concatenate,
     copying,
     datetime,
+    experimental,
     expressions,
     filling,
     groupby,
@@ -48,6 +49,8 @@
     "concatenate",
     "copying",
     "datetime",
+    "experimental",
+    "expressions",
     "filling",
     "gpumemoryview",
     "groupby",
diff --git a/python/pylibcudf/pylibcudf/binaryop.pxd b/python/pylibcudf/pylibcudf/binaryop.pxd
index 39bf183e7b0..06625e9e2db 100644
--- a/python/pylibcudf/pylibcudf/binaryop.pxd
+++ b/python/pylibcudf/pylibcudf/binaryop.pxd
@@ -1,5 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp cimport bool
 from pylibcudf.libcudf.binaryop cimport binary_operator
 from .column cimport Column
@@ -22,3 +23,10 @@ cpdef Column binary_operation(
     binary_operator op,
     DataType output_type
 )
+
+cpdef bool is_supported_operation(
+    DataType out,
+    DataType lhs,
+    DataType rhs,
+    binary_operator op
+)
diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx
index 8870bd2f2f0..5a67f4d6cdb 100644
--- a/python/pylibcudf/pylibcudf/binaryop.pyx
+++ b/python/pylibcudf/pylibcudf/binaryop.pyx
@@ -2,6 +2,7 @@
 from cython.operator import dereference
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from pylibcudf.libcudf cimport binaryop as cpp_binaryop
@@ -83,3 +84,38 @@ cpdef Column binary_operation(
         raise ValueError(f"Invalid arguments {lhs} and {rhs}")
     return Column.from_libcudf(move(result))
+
+
+cpdef bool is_supported_operation(
+    DataType out,
+    DataType lhs,
+    DataType rhs,
+    binary_operator op
+):
+    """Check if an operation is supported for the given data types.
+
+    For details, see :cpp:func:`is_supported_operation`.
+
+    Parameters
+    ----------
+    out : DataType
+        The output data type.
+    lhs : DataType
+        The left hand side data type.
+    rhs : DataType
+        The right hand side data type.
+    op : BinaryOperator
+        The operation to check.
+
+    Returns
+    -------
+    bool
+        True if the operation is supported, False otherwise.
+    """
+
+    return cpp_binaryop.is_supported_operation(
+        out.c_obj,
+        lhs.c_obj,
+        rhs.c_obj,
+        op
+    )
diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx
index c987fa3af57..7177b321bbc 100644
--- a/python/pylibcudf/pylibcudf/column.pyx
+++ b/python/pylibcudf/pylibcudf/column.pyx
@@ -252,6 +252,28 @@ cdef class Column:
             c_result = move(make_column_from_scalar(dereference(c_scalar), size))
         return Column.from_libcudf(move(c_result))
+    @staticmethod
+    def all_null_like(Column like, size_type size):
+        """Create an all null column from a template.
+
+        Parameters
+        ----------
+        like : Column
+            Column whose type we should mimic
+        size : int
+            Number of rows in the resulting column.
+
+        Returns
+        -------
+        Column
+            An all-null column of `size` rows and type matching `like`.
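A sketch of querying the new `is_supported_operation` binding from Python, assuming the in-tree `cudf._lib.pylibcudf` alias used by the tests elsewhere in this patch:

```python
import cudf._lib.pylibcudf as plc

int64 = plc.DataType(plc.TypeId.INT64)
string = plc.DataType(plc.TypeId.STRING)

# INT64 + INT64 -> INT64 is supported, so this returns True.
plc.binaryop.is_supported_operation(
    int64, int64, int64, plc.binaryop.BinaryOperator.ADD
)

# Adding a string column to an integer column is not, so this is False.
plc.binaryop.is_supported_operation(
    int64, string, int64, plc.binaryop.BinaryOperator.ADD
)
```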
+ """ + cdef Scalar slr = Scalar.empty_like(like) + cdef unique_ptr[column] c_result + with nogil: + c_result = move(make_column_from_scalar(dereference(slr.get()), size)) + return Column.from_libcudf(move(c_result)) + @staticmethod def from_cuda_array_interface_obj(object obj): """Create a Column from an object with a CUDA array interface. diff --git a/python/pylibcudf/pylibcudf/experimental.pxd b/python/pylibcudf/pylibcudf/experimental.pxd new file mode 100644 index 00000000000..107c91c8365 --- /dev/null +++ b/python/pylibcudf/pylibcudf/experimental.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool + + +cpdef enable_prefetching(str key) + +cpdef disable_prefetching(str key) + +cpdef prefetch_debugging(bool enable) diff --git a/python/pylibcudf/pylibcudf/experimental.pyx b/python/pylibcudf/pylibcudf/experimental.pyx new file mode 100644 index 00000000000..b25a53e13b2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/experimental.pyx @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.string cimport string +from pylibcudf.libcudf cimport experimental as cpp_experimental + + +cpdef enable_prefetching(str key): + """Turn on prefetch instructions for the given key. + + Parameters + ---------- + key : str + The key to enable prefetching for. + """ + cdef string c_key = key.encode("utf-8") + cpp_experimental.enable_prefetching(c_key) + + +cpdef disable_prefetching(str key): + """Turn off prefetch instructions for the given key. + + Parameters + ---------- + key : str + The key to disable prefetching for. + """ + cdef string c_key = key.encode("utf-8") + cpp_experimental.disable_prefetching(c_key) + + +cpdef prefetch_debugging(bool enable): + """Enable or disable prefetch debugging. + + When enabled, any prefetch instructions will be logged to the console. + + Parameters + ---------- + enable : bool + Whether to enable or disable prefetch debugging. + """ + cpp_experimental.prefetch_debugging(enable) diff --git a/python/pylibcudf/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx index 18ee9d3ad83..a44c9e25987 100644 --- a/python/pylibcudf/pylibcudf/expressions.pyx +++ b/python/pylibcudf/pylibcudf/expressions.pyx @@ -37,6 +37,17 @@ from .types cimport DataType # Aliases for simplicity ctypedef unique_ptr[libcudf_exp.expression] expression_ptr +# Define this class just to have a docstring for it +cdef class Expression: + """ + The base class for all expression types. + This class cannot be instantiated directly, please + instantiate one of its child classes instead. + + For details, see :cpp:class:`cudf::ast::expression`. + """ + pass + cdef class Literal(Expression): """ A literal value used in an abstract syntax tree. diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt index 8dd08d11dc8..55bea4fc262 100644 --- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
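For reference, a minimal usage sketch of the prefetching controls wrapped above. The key string below is illustrative only; valid keys are whatever prefetch points libcudf registers internally.

import pylibcudf as plc

# Log each prefetch that fires so the effect of a key is visible.
plc.experimental.prefetch_debugging(True)

# Keys are plain strings matched against libcudf's internal prefetch points;
# "column_view::get_data" is an assumed example key, not an official list.
plc.experimental.enable_prefetching("column_view::get_data")
# ... run work that touches managed memory ...
plc.experimental.disable_prefetching("column_view::get_data")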
# ============================================================================= -set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx types.pyx) +set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx parquet.pyx types.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( @@ -22,6 +22,6 @@ rapids_cython_create_modules( ) set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_csv pylibcudf_io_datasource - pylibcudf_io_json pylibcudf_io_types + pylibcudf_io_json pylibcudf_io_parquet pylibcudf_io_types ) link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd index 5b3272d60e0..62820048584 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.pxd +++ b/python/pylibcudf/pylibcudf/io/__init__.pxd @@ -1,5 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. # CSV is removed since it is def not cpdef (to force kw-only arguments) -from . cimport avro, datasource, json, types +from . cimport avro, datasource, json, parquet, types from .types cimport SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py index e17deaa4663..27640f7d955 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.py +++ b/python/pylibcudf/pylibcudf/io/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, csv, datasource, json, types +from . import avro, csv, datasource, json, parquet, types from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyx b/python/pylibcudf/pylibcudf/io/datasource.pyx index d2de339bd6b..6cc509b74cb 100644 --- a/python/pylibcudf/pylibcudf/io/datasource.pyx +++ b/python/pylibcudf/pylibcudf/io/datasource.pyx @@ -6,6 +6,8 @@ from pyarrow.lib cimport NativeFile from pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source from pylibcudf.libcudf.io.datasource cimport datasource +import warnings + cdef class Datasource: cdef datasource* get_datasource(self) except * nogil: @@ -15,10 +17,16 @@ cdef class Datasource: cdef class NativeFileDatasource(Datasource): - def __cinit__(self, NativeFile native_file,): + def __cinit__(self, NativeFile native_file): cdef shared_ptr[CRandomAccessFile] ra_src + warnings.warn( + "Support for reading pyarrow's NativeFile is deprecated " + "and will be removed in a future release of cudf.", + FutureWarning, + ) + ra_src = native_file.get_random_access_file() self.c_datasource.reset(new arrow_io_source(ra_src)) diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd new file mode 100644 index 00000000000..4e9dbdf78df --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet.pxd @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
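Note on the deprecation added to datasource.pyx above: constructing a NativeFileDatasource now emits a FutureWarning. A short sketch of acknowledging it during migration, using only the standard warnings machinery:

import io
import warnings

import pyarrow as pa
from pylibcudf.io.datasource import NativeFileDatasource

with warnings.catch_warnings():
    # Silence the deprecation explicitly while moving off NativeFile
    # to plain paths, bytes, or buffers.
    warnings.simplefilter("ignore", FutureWarning)
    src = NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r"))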
+ +from libc.stdint cimport int64_t +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from pylibcudf.expressions cimport Expression +from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.libcudf.io.parquet cimport ( + chunked_parquet_reader as cpp_chunked_parquet_reader, +) +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.types cimport DataType + + +cdef class ChunkedParquetReader: + cdef unique_ptr[cpp_chunked_parquet_reader] reader + + cpdef bool has_next(self) + cpdef TableWithMetadata read_chunk(self) + + +cpdef read_parquet( + SourceInfo source_info, + list columns = *, + list row_groups = *, + Expression filters = *, + bool convert_strings_to_categories = *, + bool use_pandas_metadata = *, + int64_t skip_rows = *, + size_type num_rows = *, + # disabled see comment in parquet.pyx for more + # ReaderColumnSchema reader_column_schema = *, + # DataType timestamp_type = * +) diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx new file mode 100644 index 00000000000..d48430fc958 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -0,0 +1,203 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cython.operator cimport dereference +from libc.stdint cimport int64_t +from libcpp cimport bool +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.expressions cimport Expression +from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.libcudf.expressions cimport expression +from pylibcudf.libcudf.io.parquet cimport ( + chunked_parquet_reader as cpp_chunked_parquet_reader, + parquet_reader_options, + read_parquet as cpp_read_parquet, +) +from pylibcudf.libcudf.io.types cimport table_with_metadata +from pylibcudf.libcudf.types cimport size_type + + +cdef parquet_reader_options _setup_parquet_reader_options( + SourceInfo source_info, + list columns = None, + list row_groups = None, + Expression filters = None, + bool convert_strings_to_categories = False, + bool use_pandas_metadata = True, + int64_t skip_rows = 0, + size_type num_rows = -1, + # ReaderColumnSchema reader_column_schema = None, + # DataType timestamp_type = DataType(type_id.EMPTY) +): + cdef vector[string] col_vec + cdef parquet_reader_options opts = ( + parquet_reader_options.builder(source_info.c_obj) + .convert_strings_to_categories(convert_strings_to_categories) + .use_pandas_metadata(use_pandas_metadata) + .use_arrow_schema(True) + .build() + ) + if row_groups is not None: + opts.set_row_groups(row_groups) + if num_rows != -1: + opts.set_num_rows(num_rows) + if skip_rows != 0: + opts.set_skip_rows(skip_rows) + if columns is not None: + col_vec.reserve(len(columns)) + for col in columns: + col_vec.push_back(str(col).encode()) + opts.set_columns(col_vec) + if filters is not None: + opts.set_filter(dereference(filters.c_obj.get())) + return opts + + +cdef class ChunkedParquetReader: + """ + Reads chunks of a Parquet file into a :py:class:`~.types.TableWithMetadata`. + + Parameters + ---------- + source_info : SourceInfo + The SourceInfo object to read the Parquet file from. + columns : list, default None + The names of the columns to be read + row_groups : list[list[size_type]], default None + List of row groups to be read. 
+ use_pandas_metadata : bool, default True + If True, return metadata about the index column in + the per-file user metadata of the ``TableWithMetadata`` + convert_strings_to_categories : bool, default False + Whether to convert string columns to the category type + skip_rows : int64_t, default 0 + The number of rows to skip from the start of the file. + num_rows : size_type, default -1 + The number of rows to read. By default, read the entire file. + chunk_read_limit : size_t, default 0 + Limit on total number of bytes to be returned per read, + or 0 if there is no limit. + pass_read_limit : size_t, default 1024000000 + Limit on the amount of memory used for reading and decompressing data, + or 0 if there is no limit. + """ + def __init__( + self, + SourceInfo source_info, + list columns=None, + list row_groups=None, + bool use_pandas_metadata=True, + bool convert_strings_to_categories=False, + int64_t skip_rows = 0, + size_type num_rows = -1, + size_t chunk_read_limit=0, + size_t pass_read_limit=1024000000 + ): + + cdef parquet_reader_options opts = _setup_parquet_reader_options( + source_info, + columns, + row_groups, + filters=None, + convert_strings_to_categories=convert_strings_to_categories, + use_pandas_metadata=use_pandas_metadata, + skip_rows=skip_rows, + num_rows=num_rows, + ) + + with nogil: + self.reader.reset( + new cpp_chunked_parquet_reader( + chunk_read_limit, + pass_read_limit, + opts + ) + ) + + cpdef bool has_next(self): + """ + Returns True if there is another chunk in the Parquet file + to be read. + + Returns + ------- + True if we have not finished reading the file. + """ + with nogil: + return self.reader.get()[0].has_next() + + cpdef TableWithMetadata read_chunk(self): + """ + Read the next chunk into a :py:class:`~.types.TableWithMetadata` + + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata (column names) that were read in. + """ + # Read Parquet + cdef table_with_metadata c_result + + with nogil: + c_result = move(self.reader.get()[0].read_chunk()) + + return TableWithMetadata.from_libcudf(c_result) + +cpdef read_parquet( + SourceInfo source_info, + list columns = None, + list row_groups = None, + Expression filters = None, + bool convert_strings_to_categories = False, + bool use_pandas_metadata = True, + int64_t skip_rows = 0, + size_type num_rows = -1, + # Disabled, these aren't used by cudf-python + # we should only add them back in if there's user demand + # ReaderColumnSchema reader_column_schema = None, + # DataType timestamp_type = DataType(type_id.EMPTY) +): + """Reads a Parquet file into a :py:class:`~.types.TableWithMetadata`. + + Parameters + ---------- + source_info : SourceInfo + The SourceInfo object to read the Parquet file from. + columns : list, default None + The string names of the columns to be read. + row_groups : list[list[size_type]], default None + List of row groups to be read. + filters : Expression, default None + An AST :py:class:`pylibcudf.expressions.Expression` + to use for predicate pushdown. + convert_strings_to_categories : bool, default False + Whether to convert string columns to the category type + use_pandas_metadata : bool, default True + If True, return metadata about the index column in + the per-file user metadata of the ``TableWithMetadata`` + skip_rows : int64_t, default 0 + The number of rows to skip from the start of the file. + num_rows : size_type, default -1 + The number of rows to read. By default, read the entire file.
+ + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata (column names) that were read in. + """ + cdef table_with_metadata c_result + cdef parquet_reader_options opts = _setup_parquet_reader_options( + source_info, + columns, + row_groups, + filters, + convert_strings_to_categories, + use_pandas_metadata, + skip_rows, + num_rows, + ) + + with nogil: + c_result = move(cpp_read_parquet(opts)) + + return TableWithMetadata.from_libcudf(c_result) diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 1edb753efc6..1600a805b37 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -121,6 +121,14 @@ cdef class TableWithMetadata: out.metadata = tbl_with_meta.metadata return out + @property + def per_file_user_data(self): + """ + Returns a list containing a dict + containing file-format specific metadata, + for each file being read in. + """ + return self.metadata.per_file_user_data cdef class SourceInfo: """A class containing details on a source to read from. diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index 925efface7b..25664286f19 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -7,12 +7,7 @@ from libcpp.utility cimport move from pylibcudf.libcudf cimport join as cpp_join from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.types cimport ( - data_type, - null_equality, - size_type, - type_id, -) +from pylibcudf.libcudf.types cimport null_equality from rmm._lib.device_buffer cimport device_buffer @@ -22,15 +17,11 @@ from .table cimport Table cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map): # helper to convert a gather map to a Column - cdef device_buffer c_empty - cdef size_type size = dereference(gather_map.get()).size() return Column.from_libcudf( move( make_unique[column]( - data_type(type_id.INT32), - size, - dereference(gather_map.get()).release(), - move(c_empty), + move(dereference(gather_map.get())), + device_buffer(), 0 ) ) diff --git a/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd b/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd index a9ca4f5b708..b9480c66c52 100644 --- a/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd @@ -1,10 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
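For orientation, a sketch of driving the ChunkedParquetReader defined above end to end; "data.parquet" is a stand-in path and the limit value is arbitrary:

import pylibcudf as plc

reader = plc.io.parquet.ChunkedParquetReader(
    plc.io.SourceInfo(["data.parquet"]),  # assumed local file for illustration
    chunk_read_limit=1024**3,  # cap each returned chunk at ~1 GiB
)

chunks = []
while reader.has_next():
    # Each chunk is a TableWithMetadata; .tbl is the pylibcudf Table.
    chunks.append(reader.read_chunk().tbl)

# Stitch the chunks back into a single table.
result = plc.concatenate.concatenate(chunks)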
from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.exception_handler cimport cudf_exception_handler from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport data_type @@ -18,9 +20,20 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: TRUE_DIV FLOOR_DIV MOD + PMOD PYMOD POW INT_POW + LOG_BASE + ATAN2 + SHIFT_LEFT + SHIFT_RIGHT + SHIFT_RIGHT_UNSIGNED + BITWISE_AND + BITWISE_OR + BITWISE_XOR + LOGICAL_AND + LOGICAL_OR EQUAL NOT_EQUAL LESS @@ -28,38 +41,46 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: LESS_EQUAL GREATER_EQUAL NULL_EQUALS + NULL_MAX + NULL_MIN NULL_NOT_EQUALS - BITWISE_AND - BITWISE_OR - BITWISE_XOR - LOGICAL_AND - LOGICAL_OR GENERIC_BINARY + NULL_LOGICAL_AND + NULL_LOGICAL_OR + INVALID_BINARY cdef unique_ptr[column] binary_operation ( const scalar& lhs, const column_view& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const scalar& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, binary_operator op, data_type output_type - ) except + + ) except +cudf_exception_handler cdef unique_ptr[column] binary_operation ( const column_view& lhs, const column_view& rhs, const string& op, data_type output_type - ) except + + ) except +cudf_exception_handler + +cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil: + cdef bool is_supported_operation( + data_type output_type, + data_type lhs_type, + data_type rhs_type, + binary_operator op + ) except +cudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/copying.pxd b/python/pylibcudf/pylibcudf/libcudf/copying.pxd index a4b6525d4d1..97439145160 100644 --- a/python/pylibcudf/pylibcudf/libcudf/copying.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/copying.pxd @@ -10,6 +10,7 @@ from pylibcudf.libcudf.column.column_view cimport ( column_view, mutable_column_view, ) +from pylibcudf.libcudf.exception_handler cimport cudf_exception_handler from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view @@ -17,8 +18,6 @@ from pylibcudf.libcudf.types cimport size_type from rmm._lib.device_buffer cimport device_buffer -from cudf._lib.exception_handler cimport cudf_exception_handler - ctypedef const scalar constscalar cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/exception_handler.pxd b/python/pylibcudf/pylibcudf/libcudf/exception_handler.pxd new file mode 100644 index 00000000000..4337d8db285 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/exception_handler.pxd @@ -0,0 +1,69 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + + +# See +# https://github.com/cython/cython/blob/master/Cython/Utility/CppSupport.cpp +# for the original Cython exception handler. +cdef extern from *: + """ + #include + #include + #include + #include + + namespace { + + /** + * @brief Exception handler to map C++ exceptions to Python ones in Cython + * + * This exception handler extends the base exception handler provided by + * Cython. 
In addition to the exceptions that Cython itself supports, this + * file adds support for additional exceptions thrown by libcudf that need + * to be mapped to specific Python exceptions. + * + * Since this function interoperates with Python's exception state, it + * does not throw any C++ exceptions. + */ + void cudf_exception_handler() + { + // Catch a handful of different errors here and turn them into the + // equivalent Python errors. + try { + if (PyErr_Occurred()) + ; // let latest Python exn pass through and ignore the current one + throw; + } catch (const std::bad_alloc& exn) { + PyErr_SetString(PyExc_MemoryError, exn.what()); + } catch (const std::bad_cast& exn) { + PyErr_SetString(PyExc_TypeError, exn.what()); + } catch (const std::domain_error& exn) { + PyErr_SetString(PyExc_ValueError, exn.what()); + } catch (const cudf::data_type_error& exn) { + // Catch subclass (data_type_error) before parent (invalid_argument) + PyErr_SetString(PyExc_TypeError, exn.what()); + } catch (const std::invalid_argument& exn) { + PyErr_SetString(PyExc_ValueError, exn.what()); + } catch (const std::ios_base::failure& exn) { + // Unfortunately, in standard C++ we have no way of distinguishing EOF + // from other errors here; be careful with the exception mask + PyErr_SetString(PyExc_IOError, exn.what()); + } catch (const std::out_of_range& exn) { + // Change out_of_range to IndexError + PyErr_SetString(PyExc_IndexError, exn.what()); + } catch (const std::overflow_error& exn) { + PyErr_SetString(PyExc_OverflowError, exn.what()); + } catch (const std::range_error& exn) { + PyErr_SetString(PyExc_ArithmeticError, exn.what()); + } catch (const std::underflow_error& exn) { + PyErr_SetString(PyExc_ArithmeticError, exn.what()); + // The below is the default catch-all case. + } catch (const std::exception& exn) { + PyErr_SetString(PyExc_RuntimeError, exn.what()); + } catch (...) { + PyErr_SetString(PyExc_RuntimeError, "Unknown exception"); + } + } + + } // anonymous namespace + """ + cdef void cudf_exception_handler() diff --git a/python/pylibcudf/pylibcudf/libcudf/experimental.pxd b/python/pylibcudf/pylibcudf/libcudf/experimental.pxd new file mode 100644 index 00000000000..f280a382a04 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/experimental.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.string cimport string + + +cdef extern from "cudf/utilities/prefetch.hpp" \ + namespace "cudf::experimental::prefetch" nogil: + # Not technically the right signature, but it's good enough to let Cython + # generate valid C++ code. It just means we'll be copying a host string + # extra, but that's OK. If we care we could generate string_view bindings, + # but there's no real rush so if we go that route we might as well + # contribute them upstream to Cython itself. + void enable_prefetching(string key) + void disable_prefetching(string key) + void prefetch_debugging(bool enable) diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd index a8e1364b54a..222d87defa0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd @@ -1,8 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
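The point of routing these declarations through except +cudf_exception_handler is that libcudf's C++ exceptions surface as idiomatic Python errors. A hedged sketch; which error actually fires depends on what libcudf throws for the given inputs:

import pyarrow as pa
import pylibcudf as plc

col = plc.interop.from_arrow(pa.array([1, 2, 3]))
bad_out = plc.interop.from_arrow(pa.string())  # nonsensical output type

try:
    plc.binaryop.binary_operation(
        col, col, plc.binaryop.BinaryOperator.ADD, bad_out
    )
except (TypeError, RuntimeError) as err:
    # Per the handler above, cudf::data_type_error maps to TypeError;
    # anything unrecognized falls through to RuntimeError.
    print(type(err).__name__, err)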
-cimport pylibcudf.libcudf.io.types as cudf_io_types -cimport pylibcudf.libcudf.table.table_view as cudf_table_view -from libc.stdint cimport uint8_t +from libc.stdint cimport int64_t, uint8_t from libcpp cimport bool from libcpp.functional cimport reference_wrapper from libcpp.map cimport map @@ -11,13 +9,24 @@ from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector from pylibcudf.libcudf.expressions cimport expression +from pylibcudf.libcudf.io.types cimport ( + compression_type, + dictionary_policy, + partition_info, + sink_info, + source_info, + statistics_freq, + table_input_metadata, + table_with_metadata, +) +from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport data_type, size_type cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cdef cppclass parquet_reader_options: parquet_reader_options() except + - cudf_io_types.source_info get_source_info() except + + source_info get_source_info() except + vector[vector[size_type]] get_row_groups() except + const optional[reference_wrapper[expression]]& get_filter() except + data_type get_timestamp_type() except + @@ -26,21 +35,24 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: # setter + void set_filter(expression &filter) except + void set_columns(vector[string] col_names) except + + void set_num_rows(size_type val) except + void set_row_groups(vector[vector[size_type]] row_grp) except + + void set_skip_rows(int64_t val) except + void enable_use_arrow_schema(bool val) except + void enable_use_pandas_metadata(bool val) except + void set_timestamp_type(data_type type) except + @staticmethod parquet_reader_options_builder builder( - cudf_io_types.source_info src + source_info src ) except + cdef cppclass parquet_reader_options_builder: parquet_reader_options_builder() except + parquet_reader_options_builder( - cudf_io_types.source_info src + source_info src ) except + parquet_reader_options_builder& columns( vector[string] col_names @@ -48,6 +60,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_reader_options_builder& row_groups( vector[vector[size_type]] row_grp ) except + + parquet_reader_options_builder& convert_strings_to_categories( + bool val + ) except + parquet_reader_options_builder& use_pandas_metadata( bool val ) except + @@ -62,15 +77,15 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: ) except + parquet_reader_options build() except + - cdef cudf_io_types.table_with_metadata read_parquet( + cdef table_with_metadata read_parquet( parquet_reader_options args) except + cdef cppclass parquet_writer_options_base: parquet_writer_options_base() except + - cudf_io_types.sink_info get_sink_info() except + - cudf_io_types.compression_type get_compression() except + - cudf_io_types.statistics_freq get_stats_level() except + - const optional[cudf_io_types.table_input_metadata]& get_metadata( + sink_info get_sink_info() except + + compression_type get_compression() except + + statistics_freq get_stats_level() except + + const optional[table_input_metadata]& get_metadata( ) except + size_t get_row_group_size_bytes() except + size_type get_row_group_size_rows() except + @@ -80,16 +95,16 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: bool is_enabled_write_arrow_schema() except + void set_metadata( - cudf_io_types.table_input_metadata m + table_input_metadata m ) except + void set_key_value_metadata( vector[map[string, string]] kvm ) except + 
void set_stats_level( - cudf_io_types.statistics_freq sf + statistics_freq sf ) except + void set_compression( - cudf_io_types.compression_type compression + compression_type compression ) except + void set_int96_timestamps( bool enabled @@ -104,14 +119,14 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_max_dictionary_size(size_t val) except + void enable_write_v2_headers(bool val) except + void enable_write_arrow_schema(bool val) except + - void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except + + void set_dictionary_policy(dictionary_policy policy) except + cdef cppclass parquet_writer_options(parquet_writer_options_base): parquet_writer_options() except + - cudf_table_view.table_view get_table() except + + table_view get_table() except + string get_column_chunks_file_paths() except + void set_partitions( - vector[cudf_io_types.partition_info] partitions + vector[partition_info] partitions ) except + void set_column_chunks_file_paths( vector[string] column_chunks_file_paths @@ -119,24 +134,24 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: @staticmethod parquet_writer_options_builder builder( - cudf_io_types.sink_info sink_, - cudf_table_view.table_view table_ + sink_info sink_, + table_view table_ ) except + cdef cppclass parquet_writer_options_builder_base[BuilderT, OptionsT]: parquet_writer_options_builder_base() except + BuilderT& metadata( - cudf_io_types.table_input_metadata m + table_input_metadata m ) except + BuilderT& key_value_metadata( vector[map[string, string]] kvm ) except + BuilderT& stats_level( - cudf_io_types.statistics_freq sf + statistics_freq sf ) except + BuilderT& compression( - cudf_io_types.compression_type compression + compression_type compression ) except + BuilderT& int96_timestamps( bool enabled @@ -166,7 +181,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: bool val ) except + BuilderT& dictionary_policy( - cudf_io_types.dictionary_policy val + dictionary_policy val ) except + OptionsT build() except + @@ -175,11 +190,11 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options]): parquet_writer_options_builder() except + parquet_writer_options_builder( - cudf_io_types.sink_info sink_, - cudf_table_view.table_view table_ + sink_info sink_, + table_view table_ ) except + parquet_writer_options_builder& partitions( - vector[cudf_io_types.partition_info] partitions + vector[partition_info] partitions ) except + parquet_writer_options_builder& column_chunks_file_paths( vector[string] column_chunks_file_paths @@ -194,7 +209,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: @staticmethod chunked_parquet_writer_options_builder builder( - cudf_io_types.sink_info sink_, + sink_info sink_, ) except + cdef cppclass chunked_parquet_writer_options_builder( @@ -203,18 +218,18 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: ): chunked_parquet_writer_options_builder() except + chunked_parquet_writer_options_builder( - cudf_io_types.sink_info sink_, + sink_info sink_, ) except + cdef cppclass parquet_chunked_writer: parquet_chunked_writer() except + parquet_chunked_writer(chunked_parquet_writer_options args) except + parquet_chunked_writer& write( - cudf_table_view.table_view table_, + table_view table_, ) except + parquet_chunked_writer& write( - const cudf_table_view.table_view& table_, - const vector[cudf_io_types.partition_info]& partitions, + const table_view& table_, + const 
vector[partition_info]& partitions, ) except + unique_ptr[vector[uint8_t]] close( vector[string] column_chunks_file_paths, @@ -230,7 +245,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: size_t pass_read_limit, const parquet_reader_options& options) except + bool has_next() except + - cudf_io_types.table_with_metadata read_chunk() except + + table_with_metadata read_chunk() except + cdef unique_ptr[vector[uint8_t]] merge_row_group_metadata( const vector[unique_ptr[vector[uint8_t]]]& metadata_list diff --git a/python/pylibcudf/pylibcudf/libcudf/io/types.pxd b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd index 4e83c56dcf0..a3d99807876 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd @@ -80,6 +80,7 @@ cdef extern from "cudf/io/types.hpp" \ map[string, string] user_data vector[unordered_map[string, string]] per_file_user_data vector[column_name_info] schema_info + vector[size_t] num_rows_per_source cdef cppclass table_with_metadata: unique_ptr[table] tbl diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd index 05d4e7628e6..48c4ec70c8a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd @@ -4,11 +4,10 @@ from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.exception_handler cimport cudf_exception_handler from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.exception_handler cimport cudf_exception_handler - cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd index fff082d90ef..e283551ed0c 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd @@ -6,4 +6,4 @@ from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil: - cdef unique_ptr[column] count_elements(const lists_column_view) except + + cdef unique_ptr[column] count_elements(const lists_column_view&) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd new file mode 100644 index 00000000000..54f5a8409b6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd @@ -0,0 +1,18 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view + + +cdef extern from "cudf/lists/filling.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] sequences( + const column_view& starts, + const column_view& sizes, + ) except + + + cdef unique_ptr[column] sequences( + const column_view& starts, + const column_view& steps, + const column_view& sizes, + ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd new file mode 100644 index 00000000000..266f04ef6b3 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd @@ -0,0 +1,36 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view +from pylibcudf.libcudf.types cimport nan_equality, null_equality + + +cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] difference_distinct( + const lists_column_view& lhs, + const lists_column_view& rhs, + null_equality nulls_equal, + nan_equality nans_equal + ) except + + + cdef unique_ptr[column] have_overlap( + const lists_column_view& lhs, + const lists_column_view& rhs, + null_equality nulls_equal, + nan_equality nans_equal + ) except + + + cdef unique_ptr[column] intersect_distinct( + const lists_column_view& lhs, + const lists_column_view& rhs, + null_equality nulls_equal, + nan_equality nans_equal + ) except + + + cdef unique_ptr[column] union_distinct( + const lists_column_view& lhs, + const lists_column_view& rhs, + null_equality nulls_equal, + nan_equality nans_equal + ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd index 561b25ed0a9..ea45f999c47 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd @@ -12,3 +12,9 @@ cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil: order column_order, null_order null_precedence ) except + + + cdef unique_ptr[column] stable_sort_lists( + const lists_column_view source_column, + order column_order, + null_order null_precedence + ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd index f9980765772..d9df7c3ca2e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd @@ -8,8 +8,13 @@ from pylibcudf.libcudf.types cimport nan_equality, null_equality cdef extern from "cudf/lists/stream_compaction.hpp" \ namespace "cudf::lists" nogil: + cdef unique_ptr[column] apply_boolean_mask( + const lists_column_view& lists_column, + const lists_column_view& boolean_mask, + ) except + + cdef unique_ptr[column] distinct( - const lists_column_view lists_column, + const lists_column_view& lists_column, null_equality nulls_equal, nan_equality nans_equal ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd index 3a2dd57f15c..ee4b47935b2 100644 --- a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd @@ -2,9 +2,12 @@ 
from libcpp.memory cimport unique_ptr from libcpp.string cimport string +from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: cdef unique_ptr[scalar] make_string_scalar(const string & _string) except + cdef unique_ptr[scalar] make_fixed_width_scalar[T](T value) except + + + cdef unique_ptr[scalar] make_empty_scalar_like(const column_view &) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd new file mode 100644 index 00000000000..fbeb6e9db90 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.libcudf.types cimport type_id + + +cdef extern from "cudf/utilities/type_dispatcher.hpp" namespace "cudf" nogil: + cdef type_id type_to_id[T]() diff --git a/python/pylibcudf/pylibcudf/lists.pxd b/python/pylibcudf/pylibcudf/lists.pxd index a8c5d3a5a7f..e7d006e6e2e 100644 --- a/python/pylibcudf/pylibcudf/lists.pxd +++ b/python/pylibcudf/pylibcudf/lists.pxd @@ -1,7 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp cimport bool -from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport null_order, size_type from .column cimport Column from .scalar cimport Scalar @@ -32,3 +32,21 @@ cpdef Column reverse(Column) cpdef Column segmented_gather(Column, Column) cpdef Column extract_list_element(Column, ColumnOrSizeType) + +cpdef Column count_elements(Column) + +cpdef Column sequences(Column, Column, Column steps = *) + +cpdef Column sort_lists(Column, bool, null_order, bool stable = *) + +cpdef Column difference_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) + +cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*) + +cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) + +cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) + +cpdef Column apply_boolean_mask(Column, Column) + +cpdef Column distinct(Column, bool, bool) diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index 4081249a0b7..947caddc485 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -8,19 +8,38 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists cimport ( contains as cpp_contains, explode as cpp_explode, + filling as cpp_filling, gather as cpp_gather, reverse as cpp_reverse, + set_operations as cpp_set_operations, ) from pylibcudf.libcudf.lists.combine cimport ( concatenate_list_elements as cpp_concatenate_list_elements, concatenate_null_policy, concatenate_rows as cpp_concatenate_rows, ) +from pylibcudf.libcudf.lists.count_elements cimport ( + count_elements as cpp_count_elements, +) from pylibcudf.libcudf.lists.extract cimport ( extract_list_element as cpp_extract_list_element, ) +from pylibcudf.libcudf.lists.sorting cimport ( + sort_lists as cpp_sort_lists, + stable_sort_lists as cpp_stable_sort_lists, +) +from pylibcudf.libcudf.lists.stream_compaction cimport ( + apply_boolean_mask as cpp_apply_boolean_mask, + distinct as cpp_distinct, +) from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.types cimport ( + nan_equality, + null_equality, + null_order, + order, + size_type, 
+) +from pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType from .column cimport Column, ListColumnView @@ -292,3 +311,376 @@ cpdef Column extract_list_element(Column input, ColumnOrSizeType index): index.view() if ColumnOrSizeType is Column else index, )) return Column.from_libcudf(move(c_result)) + + +cpdef Column count_elements(Column input): + """Count the number of elements in each + list row of the given lists column. + For details, see :cpp:func:`count_elements`. + + Parameters + ---------- + input : Column + The input column. + + Returns + ------- + Column + A new Column with the number of elements in each list row + """ + cdef ListColumnView list_view = input.list_view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_count_elements(list_view.view())) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column sequences(Column starts, Column sizes, Column steps = None): + """Create a lists column in which each row contains a sequence of + values specified by a tuple of (start, step, size) parameters. + + For details, see :cpp:func:`sequences`. + + Parameters + ---------- + starts : Column + First values in the result sequences. + sizes : Column + Numbers of values in the result sequences. + steps : Optional[Column] + Increment values for the result sequences. + + Returns + ------- + Column + The result column containing generated sequences. + """ + cdef unique_ptr[column] c_result + + if steps is not None: + with nogil: + c_result = move(cpp_filling.sequences( + starts.view(), + steps.view(), + sizes.view(), + )) + else: + with nogil: + c_result = move(cpp_filling.sequences( + starts.view(), + sizes.view(), + )) + return Column.from_libcudf(move(c_result)) + +cpdef Column sort_lists( + Column input, + bool ascending, + null_order na_position, + bool stable = False +): + """Sort the elements within a list in each row of a list column. + + For details, see :cpp:func:`sort_lists`. + + Parameters + ---------- + input : Column + The input column. + ascending : bool + If true, the sort order is ascending. Otherwise, the sort order is descending. + na_position : NullOrder + If na_position equals NullOrder.BEFORE, then the null values in the output + column are placed first. Otherwise, they are placed after. + stable : bool + If true, :cpp:func:`stable_sort_lists` is used. Otherwise, + :cpp:func:`sort_lists` is used. + + Returns + ------- + Column + A new Column with elements in each list sorted. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + cdef order c_sort_order = ( + order.ASCENDING if ascending else order.DESCENDING + ) + + with nogil: + if stable: + c_result = move(cpp_stable_sort_lists( + list_view.view(), + c_sort_order, + na_position, + )) + else: + c_result = move(cpp_sort_lists( + list_view.view(), + c_sort_order, + na_position, + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column difference_distinct( + Column lhs, + Column rhs, + bool nulls_equal=True, + bool nans_equal=True +): + """Create a lists column of the distinct elements in each row of lhs + that are not present in the corresponding row of rhs. + + For details, see :cpp:func:`difference_distinct`. + + Parameters + ---------- + lhs : Column + The input lists column of elements that may be included. + rhs : Column + The input lists column of elements to exclude. + nulls_equal : bool, default True + If true, null elements are considered equal. Otherwise, unequal.
+ nans_equal : bool, default True + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. + + Returns + ------- + Column + A lists column containing the difference results. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView lhs_view = lhs.list_view() + cdef ListColumnView rhs_view = rhs.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_set_operations.difference_distinct( + lhs_view.view(), + rhs_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column have_overlap( + Column lhs, + Column rhs, + bool nulls_equal=True, + bool nans_equal=True +): + """Check if lists at each row of the given lists columns overlap. + + For details, see :cpp:func:`have_overlap`. + + Parameters + ---------- + lhs : Column + The input lists column for one side. + rhs : Column + The input lists column for the other side. + nulls_equal : bool, default True + If true, null elements are considered equal. Otherwise, unequal. + nans_equal : bool, default True + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. + + Returns + ------- + Column + A column containing the check results. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView lhs_view = lhs.list_view() + cdef ListColumnView rhs_view = rhs.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_set_operations.have_overlap( + lhs_view.view(), + rhs_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column intersect_distinct( + Column lhs, + Column rhs, + bool nulls_equal=True, + bool nans_equal=True +): + """Create a lists column of distinct elements common to two input lists columns. + + For details, see :cpp:func:`intersect_distinct`. + + Parameters + ---------- + lhs : Column + The input lists column for one side. + rhs : Column + The input lists column for the other side. + nulls_equal : bool, default True + If true, null elements are considered equal. Otherwise, unequal. + nans_equal : bool, default True + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. + + Returns + ------- + Column + A lists column containing the intersection results. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView lhs_view = lhs.list_view() + cdef ListColumnView rhs_view = rhs.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_set_operations.intersect_distinct( + lhs_view.view(), + rhs_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column union_distinct( + Column lhs, + Column rhs, + bool nulls_equal=True, + bool nans_equal=True +): + """Create a lists column of distinct elements found in + either of two input lists columns.
+ + For details, see :cpp:func:`union_distinct`. + + Parameters + ---------- + lhs : Column + The input lists column for one side. + rhs : Column + The input lists column for the other side. + nulls_equal : bool, default True + If true, null elements are considered equal. Otherwise, unequal. + nans_equal : bool, default True + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. + + Returns + ------- + Column + A lists column containing the union results. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView lhs_view = lhs.list_view() + cdef ListColumnView rhs_view = rhs.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_set_operations.union_distinct( + lhs_view.view(), + rhs_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column apply_boolean_mask(Column input, Column boolean_mask): + """Filters elements in each row of the input lists column using a boolean mask. + + For details, see :cpp:func:`apply_boolean_mask`. + + Parameters + ---------- + input : Column + The input column. + boolean_mask : Column + The boolean mask. + + Returns + ------- + Column + A Column of filtered elements based upon the boolean mask. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + cdef ListColumnView mask_view = boolean_mask.list_view() + with nogil: + c_result = move(cpp_apply_boolean_mask( + list_view.view(), + mask_view.view(), + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): + """Create a new list column without duplicate elements in each list. + + For details, see :cpp:func:`distinct`. + + Parameters + ---------- + input : Column + The input column. + nulls_equal : bool + If true, null elements are considered equal. Otherwise, unequal. + nans_equal : bool + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. + + Returns + ------- + Column + A new list column without duplicate elements in each list.
+ """ + cdef unique_ptr[column] c_result + cdef ListColumnView list_view = input.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_distinct( + list_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/scalar.pxd b/python/pylibcudf/pylibcudf/scalar.pxd index 1f10649c4e0..8664dfa4b7e 100644 --- a/python/pylibcudf/pylibcudf/scalar.pxd +++ b/python/pylibcudf/pylibcudf/scalar.pxd @@ -6,6 +6,7 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar from rmm._lib.memory_resource cimport DeviceMemoryResource +from .column cimport Column from .types cimport DataType @@ -23,5 +24,8 @@ cdef class Scalar: cpdef DataType type(self) cpdef bool is_valid(self) + @staticmethod + cdef Scalar empty_like(Column column) + @staticmethod cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index c2b89b222cf..3e20938af0c 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -2,10 +2,13 @@ from cython cimport no_gc_clear from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport make_empty_scalar_like from rmm._lib.memory_resource cimport get_current_device_resource +from .column cimport Column from .types cimport DataType @@ -45,6 +48,21 @@ cdef class Scalar: """True if the scalar is valid, false if not""" return self.get().is_valid() + @staticmethod + cdef Scalar empty_like(Column column): + """Construct a null scalar with the same type as column. + + Parameters + ---------- + column + Column to take type from + + Returns + ------- + New empty (null) scalar of the given type. + """ + return Scalar.from_libcudf(move(make_empty_scalar_like(column.view()))) + @staticmethod cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=None): """Construct a Scalar object from a libcudf scalar. diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py index 798b14c01a8..e92a5fc655f 100644 --- a/python/pylibcudf/pylibcudf/tests/common/utils.py +++ b/python/pylibcudf/pylibcudf/tests/common/utils.py @@ -6,11 +6,11 @@ import numpy as np import pyarrow as pa +import pylibcudf as plc import pytest +from pyarrow.parquet import write_table as pq_write_table from pylibcudf.io.types import CompressionType -from cudf._lib import pylibcudf as plc - def metadata_from_arrow_type( pa_type: pa.Array, @@ -103,15 +103,68 @@ def _make_fields_nullable(typ): return pa.list_(new_fields[0]) return typ + def _contains_type(parent_typ, typ_checker): + """ + Check whether the parent or one of the children + satisfies the typ_checker. 
+ """ + if typ_checker(parent_typ): + return True + if pa.types.is_nested(parent_typ): + for i in range(parent_typ.num_fields): + if _contains_type(parent_typ.field(i).type, typ_checker): + return True + return False + if not check_field_nullability: rhs_type = _make_fields_nullable(rhs.type) rhs = rhs.cast(rhs_type) lhs_type = _make_fields_nullable(lhs.type) - lhs = rhs.cast(lhs_type) + lhs = lhs.cast(lhs_type) - if pa.types.is_floating(lhs.type) and pa.types.is_floating(rhs.type): - np.testing.assert_array_almost_equal(lhs, rhs) + assert lhs.type == rhs.type, f"{lhs.type} != {rhs.type}" + if _contains_type(lhs.type, pa.types.is_floating) and _contains_type( + rhs.type, pa.types.is_floating + ): + # Flatten nested arrays to lists to do comparisons if nested + # This is so we can do approximate comparisons + # for floats in numpy + def _flatten_arrays(arr): + if pa.types.is_nested(arr.type): + flattened = arr.flatten() + flat_arrs = [] + if isinstance(flattened, list): + for flat_arr in flattened: + flat_arrs += _flatten_arrays(flat_arr) + else: + flat_arrs = [flattened] + else: + flat_arrs = [arr] + return flat_arrs + + if isinstance(lhs, (pa.ListArray, pa.StructArray)): + lhs = _flatten_arrays(lhs) + rhs = _flatten_arrays(rhs) + else: + # Just a regular double array + lhs = [lhs] + rhs = [rhs] + + for lh_arr, rh_arr in zip(lhs, rhs): + # Check that NaN positions match + # and then filter out nans + lhs_nans = pa.compute.is_nan(lh_arr) + rhs_nans = pa.compute.is_nan(rh_arr) + assert lhs_nans.equals(rhs_nans) + + if pa.compute.any(lhs_nans) or pa.compute.any(rhs_nans): + # masks must be equal at this point + mask = pa.compute.fill_null(pa.compute.invert(lhs_nans), True) + lh_arr = lh_arr.filter(mask) + rh_arr = rh_arr.filter(mask) + + np.testing.assert_array_almost_equal(lh_arr, rh_arr) else: assert lhs.equals(rhs) @@ -266,6 +319,16 @@ def make_source(path_or_buf, pa_table, format, **kwargs): df.to_json(path_or_buf, mode=mode, **kwargs) elif format == "csv": df.to_csv(path_or_buf, mode=mode, **kwargs) + elif format == "parquet": + # The conversion to pandas is lossy (doesn't preserve + # nested types) so we + # will just use pyarrow directly to write this + pq_write_table( + pa_table, + pa.PythonFile(path_or_buf) + if isinstance(path_or_buf, io.IOBase) + else path_or_buf, + ) if isinstance(path_or_buf, io.IOBase): path_or_buf.seek(0) return path_or_buf diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py index 83166bb4990..fdce6f353ca 100644 --- a/python/pylibcudf/pylibcudf/tests/conftest.py +++ b/python/pylibcudf/pylibcudf/tests/conftest.py @@ -169,6 +169,21 @@ def source_or_sink(request, tmp_path): return fp_or_buf() +@pytest.fixture( + params=["a.txt", pathlib.Path("a.txt"), io.BytesIO], +) +def binary_source_or_sink(request, tmp_path): + fp_or_buf = request.param + if isinstance(fp_or_buf, str): + return f"{tmp_path}/{fp_or_buf}" + elif isinstance(fp_or_buf, os.PathLike): + return tmp_path.joinpath(fp_or_buf) + elif issubclass(fp_or_buf, io.IOBase): + # Must construct io.StringIO/io.BytesIO inside + # fixture, or we'll end up re-using it + return fp_or_buf() + + unsupported_types = { # Not supported by pandas # TODO: find a way to test these diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py new file mode 100644 index 00000000000..7c27115008e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py @@ -0,0 +1,108 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
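The tests below drive read_parquet through shared fixtures; stripped of the fixtures, a direct call looks roughly like this (the file name is illustrative):

import pyarrow as pa
import pyarrow.parquet as pq
import pylibcudf as plc

pq.write_table(pa.table({"a": [1, 2, 3, 4]}), "example.parquet")

tbl_w_meta = plc.io.parquet.read_parquet(
    plc.io.SourceInfo(["example.parquet"]),
    columns=["a"],
    skip_rows=1,  # skip the first row...
    num_rows=2,   # ...then read two rows
)
# .tbl holds the data; the metadata carries column names and
# per-file user data.
assert tbl_w_meta.tbl.num_rows() == 2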
+import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest +from pyarrow.parquet import read_table +from pylibcudf.expressions import ( + ASTOperator, + ColumnNameReference, + ColumnReference, + Literal, + Operation, +) +from utils import assert_table_and_meta_eq, make_source + +# Shared kwargs to pass to make_source +_COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"} + + +@pytest.mark.parametrize("columns", [None, ["col_int64", "col_bool"]]) +def test_read_parquet_basic( + table_data, binary_source_or_sink, nrows_skiprows, columns +): + _, pa_table = table_data + nrows, skiprows = nrows_skiprows + + source = make_source( + binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS + ) + + res = plc.io.parquet.read_parquet( + plc.io.SourceInfo([source]), + num_rows=nrows, + skip_rows=skiprows, + columns=columns, + ) + + if columns is not None: + pa_table = pa_table.select(columns) + + # Adapt to nrows/skiprows + pa_table = pa_table.slice( + offset=skiprows, length=nrows if nrows != -1 else None + ) + + assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) + + +@pytest.mark.parametrize( + "pa_filters,plc_filters", + [ + ( + pc.field("col_int64") >= 10, + Operation( + ASTOperator.GREATER_EQUAL, + ColumnNameReference("col_int64"), + Literal(plc.interop.from_arrow(pa.scalar(10))), + ), + ), + ( + (pc.field("col_int64") >= 10) & (pc.field("col_double") < 0), + Operation( + ASTOperator.LOGICAL_AND, + Operation( + ASTOperator.GREATER_EQUAL, + ColumnNameReference("col_int64"), + Literal(plc.interop.from_arrow(pa.scalar(10))), + ), + Operation( + ASTOperator.LESS, + ColumnNameReference("col_double"), + Literal(plc.interop.from_arrow(pa.scalar(0.0))), + ), + ), + ), + ( + (pc.field(0) == 10), + Operation( + ASTOperator.EQUAL, + ColumnReference(0), + Literal(plc.interop.from_arrow(pa.scalar(10))), + ), + ), + ], +) +def test_read_parquet_filters( + table_data, binary_source_or_sink, pa_filters, plc_filters +): + _, pa_table = table_data + + source = make_source( + binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS + ) + + plc_table_w_meta = plc.io.parquet.read_parquet( + plc.io.SourceInfo([source]), filters=plc_filters + ) + exp = read_table(source, filters=pa_filters) + assert_table_and_meta_eq( + exp, plc_table_w_meta, check_field_nullability=False + ) + + +# TODO: Test these options +# list row_groups = None, +# ^^^ This one is not tested since it's not in pyarrow/pandas, deprecate? 
+# bool convert_strings_to_categories = False, +# bool use_pandas_metadata = True diff --git a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py index 907e69d309a..747f58ec8cf 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py @@ -2,10 +2,8 @@ import io -import pyarrow as pa import pylibcudf as plc import pytest -from pylibcudf.io.datasource import NativeFileDatasource @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo]) @@ -17,10 +15,8 @@ def _skip_invalid_sinks(io_class, sink): """ Skip invalid sinks for SinkInfo """ - if io_class is plc.io.SinkInfo and isinstance( - sink, (bytes, NativeFileDatasource) - ): - pytest.skip(f"{sink} is not a valid input for SinkInfo") + if io_class is plc.io.SinkInfo and isinstance(sink, bytes): + pytest.skip("bytes is not a valid input for SinkInfo") @pytest.mark.parametrize( @@ -29,7 +25,6 @@ def _skip_invalid_sinks(io_class, sink): "a.txt", b"hello world", io.BytesIO(b"hello world"), - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), ], ) def test_source_info_ctor(io_class, source, tmp_path): @@ -46,13 +41,12 @@ def test_source_info_ctor(io_class, source, tmp_path): @pytest.mark.parametrize( "sources", [ + ["a.txt"], + [b"hello world"], + [io.BytesIO(b"hello world")], ["a.txt", "a.txt"], [b"hello world", b"hello there"], [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")], - [ - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - ], ], ) def test_source_info_ctor_multiple(io_class, sources, tmp_path): @@ -78,11 +72,6 @@ def test_source_info_ctor_multiple(io_class, sources, tmp_path): io.BytesIO(b"hello there"), b"hello world", ], - [ - NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")), - "awef.txt", - b"hello world", - ], ], ) def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path): diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py new file mode 100644 index 00000000000..f784cb3c191 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py @@ -0,0 +1,785 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
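+#
+# Tests for plc.binaryop.binary_operation: each parametrized case pairs a
+# libcudf binary operator with a pyarrow.compute or pure-Python reference;
+# combinations rejected by plc.binaryop.is_supported_operation must raise
+# TypeError.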
+ +import math + +import numpy as np +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +def idfn(param): + ltype, rtype, outtype, plc_op, _ = param + params = (plc_op.name, ltype, rtype, outtype) + return "-".join(map(str, params)) + + +@pytest.fixture(params=[True, False], ids=["nulls", "no_nulls"]) +def nulls(request): + return request.param + + +def make_col(dtype, nulls): + if dtype == "int64": + data = [1, 2, 3, 4, 5] + pa_type = pa.int64() + elif dtype == "uint64": + data = [1, 2, 3, 4, 5] + pa_type = pa.uint64() + elif dtype == "float64": + data = [1.0, 2.0, 3.0, 4.0, 5.0] + pa_type = pa.float64() + elif dtype == "bool": + data = [True, False, True, False, True] + pa_type = pa.bool_() + elif dtype == "timestamp64[ns]": + data = [ + np.datetime64("2022-01-01"), + np.datetime64("2022-01-02"), + np.datetime64("2022-01-03"), + np.datetime64("2022-01-04"), + np.datetime64("2022-01-05"), + ] + pa_type = pa.timestamp("ns") + elif dtype == "timedelta64[ns]": + data = [ + np.timedelta64(1, "ns"), + np.timedelta64(2, "ns"), + np.timedelta64(3, "ns"), + np.timedelta64(4, "ns"), + np.timedelta64(5, "ns"), + ] + pa_type = pa.duration("ns") + else: + raise ValueError("Unsupported dtype") + + if nulls: + data[3] = None + + return pa.array(data, type=pa_type) + + +@pytest.fixture +def pa_data(request, nulls): + ltype, rtype, outtype = request.param + values = make_col(ltype, nulls), make_col(rtype, nulls), outtype + return values + + +@pytest.fixture +def plc_data(pa_data): + lhs, rhs, outtype = pa_data + return ( + plc.interop.from_arrow(lhs), + plc.interop.from_arrow(rhs), + plc.interop.from_arrow(pa.from_numpy_dtype(np.dtype(outtype))), + ) + + +@pytest.fixture +def tests(request, nulls): + ltype, rtype, py_outtype, plc_op, py_op = request.param + pa_lhs, pa_rhs = make_col(ltype, nulls), make_col(rtype, nulls) + plc_lhs, plc_rhs = ( + plc.interop.from_arrow(pa_lhs), + plc.interop.from_arrow(pa_rhs), + ) + plc_dtype = plc.interop.from_arrow( + pa.from_numpy_dtype(np.dtype(py_outtype)) + ) + return ( + pa_lhs, + pa_rhs, + py_outtype, + plc_lhs, + plc_rhs, + plc_dtype, + py_op, + plc_op, + ) + + +def custom_pyop(func): + def wrapper(x, y): + x = x.to_pylist() + y = y.to_pylist() + + def inner(x, y): + if x is None or y is None: + return None + return func(x, y) + + return pa.array([inner(x, y) for x, y in zip(x, y)]) + + return wrapper + + +@custom_pyop +def py_floordiv(x, y): + return x // y + + +@custom_pyop +def py_pmod(x, y): + return (x % y + y) % y + + +@custom_pyop +def py_mod(x, y): + return x % y + + +@custom_pyop +def py_atan2(x, y): + return math.atan2(x, y) + + +@custom_pyop +def py_shift_right_unsigned(x, y): + unsigned_x = np.uint32(x) + result = unsigned_x >> y + return result + + +@pytest.mark.parametrize( + "tests", + [ + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.ADD, + pa.compute.add, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SUB, + pa.compute.subtract, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + ), + ( + 
"int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.MUL, + pa.compute.multiply, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.TRUE_DIV, + pa.compute.divide, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.FLOOR_DIV, + py_floordiv, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.MOD, py_mod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.MOD, + py_mod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.MOD, + py_mod, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.PMOD, py_pmod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.PMOD, + py_pmod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.PMOD, + py_pmod, + ), + ("int64", "int64", "int64", plc.binaryop.BinaryOperator.PYMOD, py_mod), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.PYMOD, + py_mod, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.PYMOD, + py_mod, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.INT_POW, + pa.compute.power, + ), + ( + "float64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.LOG_BASE, + pa.compute.logb, + ), + ( + "float64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "int64", + "timedelta64[ns]", + plc.binaryop.BinaryOperator.ATAN2, + py_atan2, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_LEFT, + pa.compute.shift_left, + ), + ( + "int64", + "int64", + "int64", + 
plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_RIGHT, + pa.compute.shift_right, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED, + py_shift_right_unsigned, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_AND, + pa.compute.bit_wise_and, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_OR, + pa.compute.bit_wise_or, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.BITWISE_XOR, + pa.compute.bit_wise_xor, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.EQUAL, + pa.compute.equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.EQUAL, + pa.compute.equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.NOT_EQUAL, + pa.compute.not_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NOT_EQUAL, + pa.compute.not_equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.LESS, + pa.compute.less, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LESS, + pa.compute.less, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.GREATER, + pa.compute.greater, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.GREATER, + pa.compute.greater, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.LESS_EQUAL, + pa.compute.less_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.LESS_EQUAL, + pa.compute.less_equal, + ), + ( + "int64", + "int64", + "bool", + plc.binaryop.BinaryOperator.GREATER_EQUAL, + pa.compute.greater_equal, + ), + ( + "int64", + "float64", + "float64", + 
plc.binaryop.BinaryOperator.GREATER_EQUAL, + pa.compute.greater_equal, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_EQUALS, + pa.compute.equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_EQUALS, + pa.compute.equal, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.NULL_MAX, + pa.compute.max_element_wise, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_MAX, + pa.compute.max_element_wise, + ), + ( + "int64", + "int64", + "datetime64[ns]", + plc.binaryop.BinaryOperator.NULL_MIN, + pa.compute.min_element_wise, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_MIN, + pa.compute.min_element_wise, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pa.compute.not_equal, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pa.compute.not_equal, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + pa.compute.and_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "float64", + "float64", + plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + pa.compute.or_, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.GENERIC_BINARY, + None, + ), + ( + "int64", + "int64", + "int64", + plc.binaryop.BinaryOperator.INVALID_BINARY, + None, + ), + ], + indirect=True, + ids=idfn, +) +def test_binaryops(tests): + ( + pa_lhs, + pa_rhs, + py_outtype, + plc_lhs, + plc_rhs, + plc_outtype, + py_op, + plc_op, + ) = tests + + def get_result(): + return plc.binaryop.binary_operation( + plc_lhs, + plc_rhs, + plc_op, + plc_outtype, + ) + + if not plc.binaryop.is_supported_operation( + plc_outtype, plc_lhs.type(), plc_rhs.type(), plc_op + ): + with pytest.raises(TypeError): + get_result() + else: + expect = py_op(pa_lhs, pa_rhs).cast(py_outtype) + got = get_result() + assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_column_factories.py b/python/pylibcudf/pylibcudf/tests/test_column_factories.py index 4c05770a41f..8cedbc6d42f 100644 --- a/python/pylibcudf/pylibcudf/tests/test_column_factories.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_factories.py @@ -1,11 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa +import pylibcudf as plc import pytest from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq -from cudf._lib import pylibcudf as plc - EMPTY_COL_SIZE = 3 NUMERIC_TYPES = [ diff --git a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py deleted file mode 100644 index c4ff7bb43a5..00000000000 --- a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
- -import pyarrow as pa -import pytest -from utils import assert_column_eq - -import cudf -from cudf._lib import pylibcudf as plc - -VALID_TYPES = [ - pa.int8(), - pa.int16(), - pa.int32(), - pa.int64(), - pa.uint8(), - pa.uint16(), - pa.uint32(), - pa.uint64(), - pa.float32(), - pa.float64(), - pa.bool_(), - pa.timestamp("s"), - pa.timestamp("ms"), - pa.timestamp("us"), - pa.timestamp("ns"), - pa.duration("s"), - pa.duration("ms"), - pa.duration("us"), - pa.duration("ns"), -] - - -@pytest.fixture(params=VALID_TYPES, ids=repr) -def valid_type(request): - return request.param - - -@pytest.fixture -def valid_column(valid_type): - if valid_type == pa.bool_(): - return pa.array([True, False, True], type=valid_type) - return pa.array([1, 2, 3], type=valid_type) - - -def test_from_cuda_array_interface(valid_column): - col = plc.column.Column.from_cuda_array_interface_obj( - cudf.Series(valid_column) - ) - expect = valid_column - - assert_column_eq(expect, col) diff --git a/python/pylibcudf/pylibcudf/tests/test_copying.py b/python/pylibcudf/pylibcudf/tests/test_copying.py index f27fe4e942e..628682d0a66 100644 --- a/python/pylibcudf/pylibcudf/tests/test_copying.py +++ b/python/pylibcudf/pylibcudf/tests/test_copying.py @@ -2,6 +2,7 @@ import pyarrow as pa import pyarrow.compute as pc +import pylibcudf as plc import pytest from utils import ( DEFAULT_STRUCT_TESTING_TYPE, @@ -15,8 +16,6 @@ metadata_from_arrow_type, ) -from cudf._lib import pylibcudf as plc - # TODO: consider moving this to conftest and "pairing" # it with pa_type, so that they don't get out of sync diff --git a/python/pylibcudf/pylibcudf/tests/test_join.py b/python/pylibcudf/pylibcudf/tests/test_join.py index eb25ed915b1..61e02f4d28d 100644 --- a/python/pylibcudf/pylibcudf/tests/test_join.py +++ b/python/pylibcudf/pylibcudf/tests/test_join.py @@ -2,10 +2,9 @@ import numpy as np import pyarrow as pa +import pylibcudf as plc from utils import assert_table_eq -from cudf._lib import pylibcudf as plc - def test_cross_join(): left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py index 07ecaed5012..9176bb536ef 100644 --- a/python/pylibcudf/pylibcudf/tests/test_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_lists.py @@ -1,27 +1,49 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+import numpy as np import pyarrow as pa +import pylibcudf as plc import pytest from utils import assert_column_eq -from cudf._lib import pylibcudf as plc - @pytest.fixture def test_data(): return [[[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]] +@pytest.fixture +def list_column(): + return [[0, 1], [2], [5], [6, 7]] + + @pytest.fixture def scalar(): return pa.scalar(1) @pytest.fixture -def column(): +def search_key_column(): return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32()) +@pytest.fixture +def bool_column(): + return pa.array([[False, True], [True], [True], [True, True]]) + + +@pytest.fixture +def set_lists_column(): + lhs = [[np.nan, np.nan, 2, 1, 2], [1, 2, 3], None, [4, None, 5]] + rhs = [[np.nan, 1, 2, 3], [4, 5], [None, 7, 8], [None, None]] + return lhs, rhs + + +@pytest.fixture +def lists_column(): + return [[4, 2, 3, 1], [1, 2, None, 4], [-10, 10, 10, 0]] + + def test_concatenate_rows(test_data): arrow_tbl = pa.Table.from_arrays(test_data[0], names=["a", "b"]) plc_tbl = plc.interop.from_arrow(arrow_tbl) @@ -59,8 +81,7 @@ def test_concatenate_list_elements(test_data, dropna, expected): assert_column_eq(expect, res) -def test_contains_scalar(test_data, scalar): - list_column = test_data[0][0] +def test_contains_scalar(list_column, scalar): arr = pa.array(list_column) plc_column = plc.interop.from_arrow(arr) @@ -72,9 +93,9 @@ def test_contains_scalar(test_data, scalar): assert_column_eq(expect, res) -def test_contains_list_column(test_data): - list_column1 = test_data[0][0] - list_column2 = [1, 3, 5, 1] +def test_contains_list_column(list_column, search_key_column): + list_column1 = list_column + list_column2, _ = search_key_column arr1 = pa.array(list_column1) arr2 = pa.array(list_column2) @@ -82,7 +103,7 @@ def test_contains_list_column(test_data): plc_column2 = plc.interop.from_arrow(arr2) res = plc.lists.contains(plc_column1, plc_column2) - expect = pa.array([True, False, True, False]) + expect = pa.array([False, True, True, True]) assert_column_eq(expect, res) @@ -110,8 +131,7 @@ def test_contains_nulls(list_column, expected): assert_column_eq(expect, res) -def test_index_of_scalar(test_data, scalar): - list_column = test_data[0][0] +def test_index_of_scalar(list_column, scalar): arr = pa.array(list_column) plc_column = plc.interop.from_arrow(arr) @@ -123,21 +143,19 @@ def test_index_of_scalar(test_data, scalar): assert_column_eq(expect, res) -def test_index_of_list_column(test_data, column): - list_column = test_data[0][0] +def test_index_of_list_column(list_column, search_key_column): arr1 = pa.array(list_column) - arr2, expect = column + arr2, expect = search_key_column plc_column1 = plc.interop.from_arrow(arr1) plc_column2 = plc.interop.from_arrow(arr2) res = plc.lists.index_of(plc_column1, plc_column2, True) - expect = pa.array(column[1], type=pa.int32()) + expect = pa.array(search_key_column[1], type=pa.int32()) assert_column_eq(expect, res) -def test_reverse(test_data): - list_column = test_data[0][0] +def test_reverse(list_column): arr = pa.array(list_column) plc_column = plc.interop.from_arrow(arr) @@ -149,8 +167,7 @@ def test_reverse(test_data): def test_segmented_gather(test_data): - list_column1 = test_data[0][0] - list_column2 = test_data[0][1] + list_column1, list_column2 = test_data[0] plc_column1 = plc.interop.from_arrow(pa.array(list_column1)) plc_column2 = plc.interop.from_arrow(pa.array(list_column2)) @@ -162,22 +179,212 @@ def test_segmented_gather(test_data): assert_column_eq(expect, res) -def 
test_extract_list_element_scalar(test_data): - arr = pa.array(test_data[0][0]) - plc_column = plc.interop.from_arrow(arr) +def test_extract_list_element_scalar(list_column): + plc_column = plc.interop.from_arrow(pa.array(list_column)) res = plc.lists.extract_list_element(plc_column, 0) - expect = pa.compute.list_element(test_data[0][0], 0) + expect = pa.compute.list_element(list_column, 0) assert_column_eq(expect, res) -def test_extract_list_element_column(test_data): - arr = pa.array(test_data[0][0]) - plc_column = plc.interop.from_arrow(arr) +def test_extract_list_element_column(list_column): + plc_column = plc.interop.from_arrow(pa.array(list_column)) indices = plc.interop.from_arrow(pa.array([0, 1, -4, -1])) res = plc.lists.extract_list_element(plc_column, indices) expect = pa.array([0, None, None, 7]) assert_column_eq(expect, res) + + +def test_count_elements(test_data): + arr = pa.array(test_data[0][1]) + plc_column = plc.interop.from_arrow(arr) + res = plc.lists.count_elements(plc_column) + + expect = pa.array([1, 1, 0, 3], type=pa.int32()) + + assert_column_eq(expect, res) + + +def test_sequences(): + starts = plc.interop.from_arrow(pa.array([0, 1, 2, 3, 4])) + steps = plc.interop.from_arrow(pa.array([2, 1, 1, 1, -3])) + sizes = plc.interop.from_arrow(pa.array([0, 2, 2, 1, 3])) + + res1 = plc.lists.sequences(starts, sizes, steps) + res2 = plc.lists.sequences(starts, sizes) + + expect1 = pa.array([[], [1, 2], [2, 3], [3], [4, 1, -2]]) + expect2 = pa.array([[], [1, 2], [2, 3], [3], [4, 5, 6]]) + + assert_column_eq(expect1, res1) + + assert_column_eq(expect2, res2) + + +@pytest.mark.parametrize( + "ascending,na_position,expected", + [ + ( + True, + plc.types.NullOrder.BEFORE, + [[1, 2, 3, 4], [None, 1, 2, 4], [-10, 0, 10, 10]], + ), + ( + True, + plc.types.NullOrder.AFTER, + [[1, 2, 3, 4], [1, 2, 4, None], [-10, 0, 10, 10]], + ), + ( + False, + plc.types.NullOrder.BEFORE, + [[4, 3, 2, 1], [4, 2, 1, None], [10, 10, 0, -10]], + ), + ( + False, + plc.types.NullOrder.AFTER, + [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]], + ), + ], +) +def test_sort_lists(lists_column, ascending, na_position, expected): + plc_column = plc.interop.from_arrow(pa.array(lists_column)) + res = plc.lists.sort_lists(plc_column, ascending, na_position, False) + res_stable = plc.lists.sort_lists(plc_column, ascending, na_position, True) + + expect = pa.array(expected) + + assert_column_eq(expect, res) + assert_column_eq(expect, res_stable) + + +@pytest.mark.parametrize( + "set_operation,nans_equal,nulls_equal,expected", + [ + ( + plc.lists.difference_distinct, + True, + True, + [[], [1, 2, 3], None, [4, 5]], + ), + ( + plc.lists.difference_distinct, + False, + True, + [[], [1, 2, 3], None, [4, None, 5]], + ), + ( + plc.lists.have_overlap, + True, + True, + [True, False, None, True], + ), + ( + plc.lists.have_overlap, + False, + False, + [True, False, None, False], + ), + ( + plc.lists.intersect_distinct, + True, + True, + [[np.nan, 1, 2], [], None, [None]], + ), + ( + plc.lists.intersect_distinct, + True, + False, + [[1, 2], [], None, [None]], + ), + ( + plc.lists.union_distinct, + False, + True, + [ + [np.nan, 2, 1, 3], + [1, 2, 3, 4, 5], + None, + [4, None, 5, None, None], + ], + ), + ( + plc.lists.union_distinct, + False, + False, + [ + [np.nan, np.nan, 2, 1, np.nan, 3], + [1, 2, 3, 4, 5], + None, + [4, None, 5, None, None], + ], + ), + ], +) +def test_set_operations( + set_lists_column, set_operation, 
nans_equal, nulls_equal, expected +): + lhs, rhs = set_lists_column + + res = set_operation( + plc.interop.from_arrow(pa.array(lhs)), + plc.interop.from_arrow(pa.array(rhs)), + nans_equal, + nulls_equal, + ) + + if set_operation is not plc.lists.have_overlap: + expect = pa.array(expected, type=pa.list_(pa.float64())) + else: + expect = pa.array(expected) + assert_column_eq(expect, res) + + +@pytest.mark.parametrize( + "nans_equal,nulls_equal,expected", + [ + (True, True, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]), + ( + False, + True, + [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, None, 5]], + ), + ( + True, + False, + [[np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]], + ), + ( + False, + False, + [ + [np.nan, np.nan, 0, 1, 2, 3], + [3, 1, 2], + None, + [4, None, None, 5], + ], + ), + ], +) +def test_distinct(nans_equal, nulls_equal, expected): + list_column = [ + [np.nan, np.nan, 0, 1, 2, 3, 2], + [3, 1, 2], + None, + [4, None, None, 5], + ] + arr = pa.array(list_column) + plc_column = plc.interop.from_arrow(arr) + + res = plc.lists.distinct(plc_column, nans_equal, nulls_equal) + + expect = pa.array(expected) + + assert_column_eq(expect, res) diff --git a/python/pylibcudf/pylibcudf/tests/test_reshape.py b/python/pylibcudf/pylibcudf/tests/test_reshape.py index da1157e5832..01115bc363a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_reshape.py +++ b/python/pylibcudf/pylibcudf/tests/test_reshape.py @@ -1,11 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa +import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq -from cudf._lib import pylibcudf as plc - @pytest.fixture(scope="module") def reshape_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_traits.py b/python/pylibcudf/pylibcudf/tests/test_traits.py index 6c22cb02f21..2570e8abd51 100644 --- a/python/pylibcudf/pylibcudf/tests/test_traits.py +++ b/python/pylibcudf/pylibcudf/tests/test_traits.py @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from cudf._lib import pylibcudf as plc +import pylibcudf as plc def test_is_relationally_comparable(): diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py index 312939888dd..06fc35d8835 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -3,10 +3,9 @@ import math import pyarrow as pa +import pylibcudf as plc from utils import assert_column_eq -from cudf._lib import pylibcudf as plc - def test_nans_to_nulls(has_nans): if has_nans: diff --git a/python/pylibcudf/pylibcudf/tests/test_unary.py b/python/pylibcudf/pylibcudf/tests/test_unary.py index b5e4f0cb0e8..9b8085d5c52 100644 --- a/python/pylibcudf/pylibcudf/tests/test_unary.py +++ b/python/pylibcudf/pylibcudf/tests/test_unary.py @@ -1,6 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from cudf._lib import pylibcudf as plc +import pylibcudf as plc def test_is_supported_cast(): diff --git a/python/pylibcudf/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx index 0a8cf5fcb6a..d13365eebfb 100644 --- a/python/pylibcudf/pylibcudf/types.pyx +++ b/python/pylibcudf/pylibcudf/types.pyx @@ -1,7 +1,8 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION.
from libc.stdint cimport int32_t -from pylibcudf.libcudf.types cimport data_type, type_id +from pylibcudf.libcudf.types cimport data_type, size_type, type_id +from pylibcudf.libcudf.utilities.type_dispatcher cimport type_to_id from pylibcudf.libcudf.types import type_id as TypeId # no-cython-lint, isort:skip from pylibcudf.libcudf.types import nan_policy as NanPolicy # no-cython-lint, isort:skip @@ -66,3 +67,7 @@ cdef class DataType: cdef DataType ret = DataType.__new__(DataType, type_id.EMPTY) ret.c_obj = dt return ret + + +SIZE_TYPE = DataType(type_to_id[size_type]()) +SIZE_TYPE_ID = SIZE_TYPE.id() diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 2686e09269a..d9f4ffc0759 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "nvtx>=0.2.1", "packaging", "pyarrow>=16.1.0,<16.2.0a0", - "rmm==24.8.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -111,13 +111,14 @@ skip = [ [tool.rapids-build-backend] build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" +matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", "numpy==1.23.*", "pyarrow==16.1.0.*", - "rmm==24.8.*,>=0.0.0a0", + "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [tool.scikit-build] @@ -127,7 +128,7 @@ cmake.minimum-version = "3.26.4" ninja.make-fallback = true sdist.exclude = ["*tests*"] sdist.reproducible = true -wheel.packages = ["cudf"] +wheel.packages = ["pylibcudf"] [tool.scikit-build.metadata.version] provider = "scikit_build_core.metadata.regex"
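
For reference, the parquet reader exercised by the new tests above can be driven end to end as in the following minimal sketch. This is illustrative only and not part of the patch; the table contents and the in-memory buffer are assumptions.

import io

import pyarrow as pa
import pyarrow.parquet as pq
import pylibcudf as plc

# Write a small pyarrow table to an in-memory parquet buffer, mirroring what
# make_source does for format="parquet".
pa_table = pa.table({"col_int64": [1, 2, 3], "col_double": [0.5, 1.5, 2.5]})
buf = io.BytesIO()
pq.write_table(pa_table, buf)
buf.seek(0)

# Read it back through pylibcudf, as test_read_parquet_basic does: select one
# column and skip the first row (num_rows=-1 reads to the end).
result = plc.io.parquet.read_parquet(
    plc.io.SourceInfo([buf]),
    num_rows=-1,
    skip_rows=1,
    columns=["col_int64"],
)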