diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1e1ad94ab0b..9e72c0119f3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -88,6 +88,13 @@ repos:
               # of dependencies, so we'll have to update this manually.
               additional_dependencies:
                 - cmake-format==0.6.11
+      - id: copyright-check
+        name: copyright-check
+        # This hook's use of Git tools appears to conflict with
+        # existing CI invocations so we don't invoke it during CI runs.
+        stages: [commit]
+        entry: python ./ci/checks/copyright.py --git-modified-only
+        language: python
 
 default_language_version:
     python: python3
diff --git a/build.sh b/build.sh
index 8b3add1dddd..765a1b5325f 100755
--- a/build.sh
+++ b/build.sh
@@ -18,7 +18,7 @@ ARGS=$*
 REPODIR=$(cd $(dirname $0); pwd)
 
 VALIDARGS="clean libcudf cudf dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n -l --allgpuarch --disable_nvtx --show_depr_warn --ptds -h --build_metrics --incl_cache_stats"
-HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [-l] [--cmake-args=\\\"\\\"]
+HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"]
    clean            - remove all existing build artifacts and configuration (start over)
    libcudf          - build the cudf C++ code only
@@ -32,7 +32,6 @@ HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafk
    -v               - verbose build mode
    -g               - build for debug
    -n               - no install step
-   -l               - build legacy tests
    --allgpuarch     - build for all supported GPU architectures
    --disable_nvtx   - disable inserting NVTX profiling ranges
    --show_depr_warn - show cmake deprecation warnings
diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh
index 178bdab0154..62eeb4d131b 100755
--- a/ci/benchmark/build.sh
+++ b/ci/benchmark/build.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 #########################################
 # cuDF GPU build and test script for CI #
 #########################################
@@ -98,11 +98,7 @@ conda list --show-channel-urls
 ################################################################################
 
 logger "Build libcudf..."
-if [[ "${BUILD_MODE}" == "pull-request" ]]; then
-    "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests --ptds
-else
-    "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests -l --ptds
-fi
+"$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests --ptds
 
 ################################################################################
 # BENCHMARK - Run and parse libcudf and cuDF benchmarks
diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py
new file mode 100644
index 00000000000..d72fd95fea3
--- /dev/null
+++ b/ci/checks/copyright.py
@@ -0,0 +1,233 @@
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import datetime
+import re
+import argparse
+import io
+import os
+import sys
+
+SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
+
+# Add the scripts dir for gitutils
+sys.path.append(os.path.normpath(SCRIPT_DIR))
+
+# Now import gitutils. Ignore flake8 error here since there is no other way to
+# set up imports
+import gitutils  # noqa: E402
+
+FilesToCheck = [
+    re.compile(r"[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$"),
+    re.compile(r"CMakeLists[.]txt$"),
+    re.compile(r"CMakeLists_standalone[.]txt$"),
+    re.compile(r"setup[.]cfg$"),
+    re.compile(r"[.]flake8[.]cython$"),
+    re.compile(r"meta[.]yaml$")
+]
+ExemptFiles = []
+
+# this will break starting at year 10000, which is probably OK :)
+CheckSimple = re.compile(
+    r"Copyright *(?:\(c\))? *(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)")
+CheckDouble = re.compile(
+    r"Copyright *(?:\(c\))? *(\d{4})-(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)"  # noqa: E501
+)
+
+
+def checkThisFile(f):
+    # This check covers things like symlinks that point to nonexistent files
+    if not (os.path.exists(f)):
+        return False
+    if gitutils and gitutils.isFileEmpty(f):
+        return False
+    for exempt in ExemptFiles:
+        if exempt.search(f):
+            return False
+    for checker in FilesToCheck:
+        if checker.search(f):
+            return True
+    return False
+
+
+def getCopyrightYears(line):
+    res = CheckSimple.search(line)
+    if res:
+        return (int(res.group(1)), int(res.group(1)))
+    res = CheckDouble.search(line)
+    if res:
+        return (int(res.group(1)), int(res.group(2)))
+    return (None, None)
+
+
+def replaceCurrentYear(line, start, end):
+    # first turn a simple regex into double (if applicable). then update years
+    res = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line)
+    res = CheckDouble.sub(
+        r"Copyright (c) {:04d}-{:04d}, NVIDIA CORPORATION".format(start, end),
+        res)
+    return res
+
+
+def checkCopyright(f, update_current_year):
+    """
+    Checks for copyright headers and their years
+    """
+    errs = []
+    thisYear = datetime.datetime.now().year
+    lineNum = 0
+    crFound = False
+    yearMatched = False
+    with io.open(f, "r", encoding="utf-8") as fp:
+        lines = fp.readlines()
+    for line in lines:
+        lineNum += 1
+        start, end = getCopyrightYears(line)
+        if start is None:
+            continue
+        crFound = True
+        if start > end:
+            e = [
+                f,
+                lineNum,
+                "First year after second year in the copyright "
+                "header (manual fix required)",
+                None
+            ]
+            errs.append(e)
+        if thisYear < start or thisYear > end:
+            e = [
+                f,
+                lineNum,
+                "Current year not included in the "
+                "copyright header",
+                None
+            ]
+            if thisYear < start:
+                e[-1] = replaceCurrentYear(line, thisYear, end)
+            if thisYear > end:
+                e[-1] = replaceCurrentYear(line, start, thisYear)
+            errs.append(e)
+        else:
+            yearMatched = True
+    # copyright header itself not found
+    if not crFound:
+        e = [
+            f,
+            0,
+            "Copyright header missing or formatted incorrectly "
+            "(manual fix required)",
+            None
+        ]
+        errs.append(e)
+    # if the current year matched any copyright header, the check passes
+    if yearMatched:
+        errs = []
+
+    if update_current_year:
+        errs_update = [x for x in errs if x[-1] is not None]
+        if len(errs_update) > 0:
+            print("File: {}. Changing line(s) {}".format(
+                f, ', '.join(str(x[1]) for x in errs if x[-1] is not None)))
+            for _, lineNum, __, replacement in errs_update:
+                lines[lineNum - 1] = replacement
+            with io.open(f, "w", encoding="utf-8") as out_file:
+                for new_line in lines:
+                    out_file.write(new_line)
+        errs = [x for x in errs if x[-1] is None]
+
+    return errs
+
+
+def getAllFilesUnderDir(root, pathFilter=None):
+    retList = []
+    for (dirpath, dirnames, filenames) in os.walk(root):
+        for fn in filenames:
+            filePath = os.path.join(dirpath, fn)
+            if pathFilter(filePath):
+                retList.append(filePath)
+    return retList
+
+
+def checkCopyright_main():
+    """
+    Checks for copyright headers in all the modified files. For a local
+    repo, this script will just look for uncommitted files; in CI, it
+    compares between the branches "$PR_TARGET_BRANCH" and
+    "current-pr-branch"
+    """
+    retVal = 0
+    global ExemptFiles
+
+    argparser = argparse.ArgumentParser(
+        "Checks for a consistent copyright header in git's modified files")
+    argparser.add_argument("--update-current-year",
+                           dest='update_current_year',
+                           action="store_true",
+                           required=False,
+                           help="If set, "
+                           "update the current year if a header is already "
+                           "present and well formatted.")
+    argparser.add_argument("--git-modified-only",
+                           dest='git_modified_only',
+                           action="store_true",
+                           required=False,
+                           help="If set, "
+                           "only files seen as modified by git will be "
+                           "processed.")
+
+    (args, dirs) = argparser.parse_known_args()
+    try:
+        ExemptFiles = [re.compile(file) for file in ExemptFiles]
+    except re.error as reException:
+        print("Regular expression error:")
+        print(reException)
+        return 1
+
+    if args.git_modified_only:
+        files = gitutils.modifiedFiles(pathFilter=checkThisFile)
+    else:
+        files = []
+        for d in [os.path.abspath(d) for d in dirs]:
+            if not (os.path.isdir(d)):
+                raise ValueError(f"{d} is not a directory.")
+            files += getAllFilesUnderDir(d, pathFilter=checkThisFile)
+
+    errors = []
+    for f in files:
+        errors += checkCopyright(f, args.update_current_year)
+
+    if len(errors) > 0:
+        print("Copyright headers incomplete in some of the files!")
+        for e in errors:
+            print("  %s:%d Issue: %s" % (e[0], e[1], e[2]))
+        print("")
+        n_fixable = sum(1 for e in errors if e[-1] is not None)
+        path_parts = os.path.abspath(__file__).split(os.sep)
+        file_from_repo = os.sep.join(path_parts[path_parts.index("ci"):])
+        if n_fixable > 0:
+            print(("You can run `python {} --git-modified-only "
+                   "--update-current-year` to fix {} of these "
+                   "errors.\n").format(file_from_repo, n_fixable))
+        retVal = 1
+    else:
+        print("Copyright check passed")
+
+    return retVal
+
+
+if __name__ == "__main__":
+    sys.exit(checkCopyright_main())
\ No newline at end of file
diff --git a/ci/checks/gitutils.py b/ci/checks/gitutils.py
new file mode 100644
index 00000000000..0aea1d660cb
--- /dev/null
+++ b/ci/checks/gitutils.py
@@ -0,0 +1,286 @@
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
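[Editor's aside on the copyright.py script above, before gitutils.py continues: the year-rewriting logic can be exercised in isolation. A minimal standalone sketch using the same two patterns; the sample header line and the `update_year` wrapper are illustrative, not part of the PR.]

```python
import re

# Same patterns as copyright.py: a single-year form and a year-range form.
CheckSimple = re.compile(
    r"Copyright *(?:\(c\))? *(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)")
CheckDouble = re.compile(
    r"Copyright *(?:\(c\))? *(\d{4})-(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)")

def update_year(line, this_year):
    # Mirror replaceCurrentYear(): promote "YYYY" to "YYYY-YYYY" first,
    # then rewrite the range so it ends at the current year.
    line = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line)
    start = int(CheckDouble.search(line).group(1))
    return CheckDouble.sub(
        "Copyright (c) {:04d}-{:04d}, NVIDIA CORPORATION".format(start, this_year),
        line)

print(update_year("# Copyright (c) 2020, NVIDIA CORPORATION.", 2022))
# -> "# Copyright (c) 2020-2022, NVIDIA CORPORATION."
```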
+#
+
+import subprocess
+import os
+import re
+
+
+def isFileEmpty(f):
+    return os.stat(f).st_size == 0
+
+
+def __git(*opts):
+    """Runs a git command and returns its output"""
+    cmd = "git " + " ".join(list(opts))
+    ret = subprocess.check_output(cmd, shell=True)
+    return ret.decode("UTF-8").rstrip("\n")
+
+
+def __gitdiff(*opts):
+    """Runs a git diff command with no pager set"""
+    return __git("--no-pager", "diff", *opts)
+
+
+def branch():
+    """Returns the name of the current branch"""
+    name = __git("rev-parse", "--abbrev-ref", "HEAD")
+    name = name.rstrip()
+    return name
+
+
+def repo_version():
+    """
+    Determines the version of the repo by using `git describe`
+
+    Returns
+    -------
+    str
+        The full version of the repo in the format 'v#.#.#{a|b|rc}'
+    """
+    return __git("describe", "--tags", "--abbrev=0")
+
+
+def repo_version_major_minor():
+    """
+    Determines the version of the repo using `git describe` and returns only
+    the major and minor portion
+
+    Returns
+    -------
+    str
+        The partial version of the repo in the format '{major}.{minor}'
+    """
+
+    full_repo_version = repo_version()
+
+    match = re.match(r"^v?(?P<major>[0-9]+)(?:\.(?P<minor>[0-9]+))?",
+                     full_repo_version)
+
+    if (match is None):
+        print("  [DEBUG] Could not determine repo major minor version. "
+              f"Full repo version: {full_repo_version}.")
+        return None
+
+    out_version = match.group("major")
+
+    if (match.group("minor")):
+        out_version += "." + match.group("minor")
+
+    return out_version
+
+
+def determine_merge_commit(current_branch="HEAD"):
+    """
+    When running outside of CI, this will estimate the target merge commit
+    hash of `current_branch` by finding a common ancestor with the remote
+    branch 'branch-{major}.{minor}' where {major} and {minor} are determined
+    from the repo version.
+
+    Parameters
+    ----------
+    current_branch : str, optional
+        Which branch to consider as the current branch, by default "HEAD"
+
+    Returns
+    -------
+    str
+        The common commit hash ID
+    """
+
+    try:
+        # Try to determine the target branch from the most recent tag
+        head_branch = __git("describe",
+                            "--all",
+                            "--tags",
+                            "--match='branch-*'",
+                            "--abbrev=0")
+    except subprocess.CalledProcessError:
+        print("  [DEBUG] Could not determine target branch from most recent "
+              "tag. Falling back to 'branch-{major}.{minor}'.")
+        head_branch = None
+
+    if (head_branch is not None):
+        # Convert from head to branch name
+        head_branch = __git("name-rev", "--name-only", head_branch)
+    else:
+        # Try and guess the target branch as "branch-{major}.{minor}"
+        version = repo_version_major_minor()
+
+        if (version is None):
+            return None
+
+        head_branch = "branch-{}".format(version)
+
+    try:
+        # Now get the remote tracking branch
+        remote_branch = __git("rev-parse",
+                              "--abbrev-ref",
+                              "--symbolic-full-name",
+                              head_branch + "@{upstream}")
+    except subprocess.CalledProcessError:
+        print("  [DEBUG] Could not find the remote tracking reference for "
+              f"branch {head_branch}.")
+        remote_branch = None
+
+    if (remote_branch is None):
+        return None
+
+    print(f"  [DEBUG] Determined TARGET_BRANCH as: '{remote_branch}'. "
+          "Finding common ancestor.")
+
+    common_commit = __git("merge-base", remote_branch, current_branch)
+
+    return common_commit
+
+
+def uncommittedFiles():
+    """
+    Returns a list of all changed files that are not yet committed. This
+    means both untracked/unstaged as well as uncommitted files too.
+ """ + files = __git("status", "-u", "-s") + ret = [] + for f in files.splitlines(): + f = f.strip(" ") + f = re.sub("\s+", " ", f) # noqa: W605 + tmp = f.split(" ", 1) + # only consider staged files or uncommitted files + # in other words, ignore untracked files + if tmp[0] == "M" or tmp[0] == "A": + ret.append(tmp[1]) + return ret + + +def changedFilesBetween(baseName, branchName, commitHash): + """ + Returns a list of files changed between branches baseName and latest commit + of branchName. + """ + current = branch() + # checkout "base" branch + __git("checkout", "--force", baseName) + # checkout branch for comparing + __git("checkout", "--force", branchName) + # checkout latest commit from branch + __git("checkout", "-fq", commitHash) + + files = __gitdiff("--name-only", + "--ignore-submodules", + f"{baseName}..{branchName}") + + # restore the original branch + __git("checkout", "--force", current) + return files.splitlines() + + +def changesInFileBetween(file, b1, b2, filter=None): + """Filters the changed lines to a file between the branches b1 and b2""" + current = branch() + __git("checkout", "--quiet", b1) + __git("checkout", "--quiet", b2) + diffs = __gitdiff("--ignore-submodules", + "-w", + "--minimal", + "-U0", + "%s...%s" % (b1, b2), + "--", + file) + __git("checkout", "--quiet", current) + lines = [] + for line in diffs.splitlines(): + if filter is None or filter(line): + lines.append(line) + return lines + + +def modifiedFiles(pathFilter=None): + """ + If inside a CI-env (ie. TARGET_BRANCH and COMMIT_HASH are defined, and + current branch is "current-pr-branch"), then lists out all files modified + between these 2 branches. Locally, TARGET_BRANCH will try to be determined + from the current repo version and finding a coresponding branch named + 'branch-{major}.{minor}'. If this fails, this functino will list out all + the uncommitted files in the current branch. + + Such utility function is helpful while putting checker scripts as part of + cmake, as well as CI process. This way, during development, only the files + touched (but not yet committed) by devs can be checked. But, during the CI + process ALL files modified by the dev, as submiited in the PR, will be + checked. This happens, all the while using the same script. + """ + targetBranch = os.environ.get("TARGET_BRANCH") + commitHash = os.environ.get("COMMIT_HASH") + currentBranch = branch() + print( + f" [DEBUG] TARGET_BRANCH={targetBranch}, COMMIT_HASH={commitHash}, " + f"currentBranch={currentBranch}") + + if targetBranch and commitHash and (currentBranch == "current-pr-branch"): + print(" [DEBUG] Assuming a CI environment.") + allFiles = changedFilesBetween(targetBranch, currentBranch, commitHash) + else: + print(" [DEBUG] Did not detect CI environment. " + "Determining TARGET_BRANCH locally.") + + common_commit = determine_merge_commit(currentBranch) + + if (common_commit is not None): + + # Now get the diff. 
Use --staged to get both diff between + # common_commit..HEAD and any locally staged files + allFiles = __gitdiff("--name-only", + "--ignore-submodules", + "--staged", + f"{common_commit}").splitlines() + else: + # Fallback to just uncommitted files + allFiles = uncommittedFiles() + + files = [] + for f in allFiles: + if pathFilter is None or pathFilter(f): + files.append(f) + + filesToCheckString = "\n\t".join(files) if files else "" + print(f" [DEBUG] Found files to check:\n\t{filesToCheckString}\n") + return files + + +def listAllFilesInDir(folder): + """Utility function to list all files/subdirs in the input folder""" + allFiles = [] + for root, dirs, files in os.walk(folder): + for name in files: + allFiles.append(os.path.join(root, name)) + return allFiles + + +def listFilesToCheck(filesDirs, filter=None): + """ + Utility function to filter the input list of files/dirs based on the input + filter method and returns all the files that need to be checked + """ + allFiles = [] + for f in filesDirs: + if os.path.isfile(f): + if filter is None or filter(f): + allFiles.append(f) + elif os.path.isdir(f): + files = listAllFilesInDir(f) + for f_ in files: + if filter is None or filter(f_): + allFiles.append(f_) + return allFiles diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 9fb86b0b3c5..a7ad260758d 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. ##################### # cuDF Style Tester # ##################### @@ -19,9 +19,24 @@ export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake. mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} + pre-commit run --hook-stage manual --all-files PRE_COMMIT_RETVAL=$? +# Check for copyright headers in the files modified currently +COPYRIGHT=`python ci/checks/copyright.py --git-modified-only 2>&1` +CR_RETVAL=$? + +# Output results if failure otherwise show pass +if [ "$CR_RETVAL" != "0" ]; then + echo -e "\n\n>>>> FAILED: copyright check; begin output\n\n" + echo -e "$COPYRIGHT" + echo -e "\n\n>>>> FAILED: copyright check; end output\n\n" +else + echo -e "\n\n>>>> PASSED: copyright check\n\n" + echo -e "$COPYRIGHT" +fi + # Run clang-format and check for a consistent code format CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1` CLANG_FORMAT_RETVAL=$? @@ -40,7 +55,7 @@ HEADER_META_RETVAL=$? 
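[Editor's aside before the rest of the style.sh hunk: when run locally, the gitutils helpers above boil down to a short sequence of git calls. A condensed standalone sketch of that flow, with error handling and the CI path elided; the `v22.02.00` tag shown in the comment is hypothetical.]

```python
import subprocess

def git(*args):
    # Minimal stand-in for gitutils.__git (which also disables the pager)
    return subprocess.check_output(("git",) + args).decode().rstrip("\n")

# 1. Guess the target branch from the repo version, e.g. v22.02.00 -> branch-22.02
version = git("describe", "--tags", "--abbrev=0")
major, minor = version.lstrip("v").split(".")[:2]
head_branch = f"branch-{major}.{minor}"

# 2. Resolve its remote tracking branch and the merge base with HEAD
remote = git("rev-parse", "--abbrev-ref", "--symbolic-full-name",
             head_branch + "@{upstream}")
base = git("merge-base", remote, "HEAD")

# 3. Diff against the merge base; --staged also picks up staged-only changes
changed = git("diff", "--name-only", "--ignore-submodules", "--staged",
              base).splitlines()
print(changed)
```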
echo -e "$HEADER_META" RETVALS=( - $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL + $CR_RETVAL $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL ) IFS=$'\n' RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1` diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index e6ef72d930c..f2f67e9e000 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -23,25 +23,13 @@ if [ -z "$MY_UPLOAD_KEY" ]; then return 0 fi -################################################################################ -# SETUP - Get conda file output locations -################################################################################ - -gpuci_logger "Get conda file output locations" - -export LIBCUDF_FILE=`conda build --no-build-id --croot "$WORKSPACE/.conda-bld" conda/recipes/libcudf --output` -export LIBCUDF_KAFKA_FILE=`conda build --no-build-id --croot "$WORKSPACE/.conda-bld" conda/recipes/libcudf_kafka --output` -export CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf --python=$PYTHON --output` -export DASK_CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/dask-cudf --python=$PYTHON --output` -export CUDF_KAFKA_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf_kafka --python=$PYTHON --output` -export CUSTREAMZ_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON --output` - ################################################################################ # UPLOAD - Conda packages ################################################################################ gpuci_logger "Starting conda uploads" if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF" == "1" ]]; then + export LIBCUDF_FILE=$(conda build --no-build-id --croot "${CONDA_BLD_DIR}" conda/recipes/libcudf --output) test -e ${LIBCUDF_FILE} echo "Upload libcudf" echo ${LIBCUDF_FILE} @@ -49,16 +37,19 @@ if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF" == "1" ]]; then fi if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF" == "1" ]]; then + export CUDF_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/cudf --python=$PYTHON --output) test -e ${CUDF_FILE} echo "Upload cudf" echo ${CUDF_FILE} gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUDF_FILE} --no-progress + export DASK_CUDF_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/dask-cudf --python=$PYTHON --output) test -e ${DASK_CUDF_FILE} echo "Upload dask-cudf" echo ${DASK_CUDF_FILE} gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${DASK_CUDF_FILE} --no-progress + export CUSTREAMZ_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/custreamz --python=$PYTHON --output) test -e ${CUSTREAMZ_FILE} echo "Upload custreamz" echo ${CUSTREAMZ_FILE} @@ -66,6 +57,7 @@ if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF" == "1" ]]; then fi if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF_KAFKA" == "1" ]]; then + export LIBCUDF_KAFKA_FILE=$(conda build --no-build-id --croot "${CONDA_BLD_DIR}" conda/recipes/libcudf_kafka --output) test -e ${LIBCUDF_KAFKA_FILE} echo "Upload libcudf_kafka" echo ${LIBCUDF_KAFKA_FILE} @@ -73,6 +65,7 @@ if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF_KAFKA" == "1" ]]; then fi if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF_KAFKA" == "1" ]]; then + export CUDF_KAFKA_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/cudf_kafka --python=$PYTHON --output) test -e ${CUDF_KAFKA_FILE} echo "Upload cudf_kafka" echo ${CUDF_KAFKA_FILE} diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 
53ad948b61c..a79ffa0fc47 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. ############################################## # cuDF GPU build and test script for CI # ############################################## @@ -128,11 +128,7 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then ################################################################################ gpuci_logger "Build from source" - if [[ "${BUILD_MODE}" == "pull-request" ]]; then - "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds - else - "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests -l --ptds - fi + "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds ################################################################################ # TEST - Run GoogleTest @@ -226,11 +222,7 @@ else install_dask gpuci_logger "Build python libs from source" - if [[ "${BUILD_MODE}" == "pull-request" ]]; then - "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka --ptds - else - "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka -l --ptds - fi + "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka --ptds fi @@ -249,15 +241,15 @@ fi cd "$WORKSPACE/python/cudf" gpuci_logger "Python py.test for cuDF" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope cudf cd "$WORKSPACE/python/dask_cudf" gpuci_logger "Python py.test for dask-cudf" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term dask_cudf cd "$WORKSPACE/python/custreamz" gpuci_logger "Python py.test for cuStreamz" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term custreamz gpuci_logger "Test notebooks" "$WORKSPACE/ci/gpu/test-notebooks.sh" 2>&1 | tee nbtest.log diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index b9577d937d9..b926a6cdc99 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. 
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. name: cudf_dev channels: @@ -17,7 +17,7 @@ dependencies: - numba>=0.54 - numpy - pandas>=1.0,<1.4.0dev0 - - pyarrow=5.0.0=*cuda + - pyarrow=6.0.1=*cuda - fastavro>=0.22.9 - python-snappy>=0.6.0 - notebook>=0.5.0 @@ -45,7 +45,7 @@ dependencies: - dask>=2021.11.1,<=2022.01.0 - distributed>=2021.11.1,<=2022.01.0 - streamz - - arrow-cpp=5.0.0 + - arrow-cpp=6.0.1 - dlpack>=0.5,<0.6.0a0 - arrow-cpp-proc * cuda - double-conversion diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index bd1412bc611..0145e2e4d01 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -31,7 +31,7 @@ requirements: - setuptools - numba >=0.54 - dlpack>=0.5,<0.6.0a0 - - pyarrow 5.0.0 *cuda + - pyarrow 6.0.1 *cuda - libcudf {{ version }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 70c020d4abd..4e20c979f6c 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -40,7 +40,7 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 5.0.0 *cuda + - arrow-cpp 6.0.1 *cuda - arrow-cpp-proc * cuda - dlpack>=0.5,<0.6.0a0 run: @@ -216,6 +216,7 @@ test: - test -f $PREFIX/include/cudf/strings/replace_re.hpp - test -f $PREFIX/include/cudf/strings/split/partition.hpp - test -f $PREFIX/include/cudf/strings/split/split.hpp + - test -f $PREFIX/include/cudf/strings/split/split_re.hpp - test -f $PREFIX/include/cudf/strings/string_view.hpp - test -f $PREFIX/include/cudf/strings/strings_column_view.hpp - test -f $PREFIX/include/cudf/strings/strip.hpp diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index 0b274f3a41d..1eb5b13ddc3 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -27,8 +27,6 @@ requirements: host: - libcudf {{version}} - librdkafka >=1.7.0,<1.8.0a0 - run: - - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not test: commands: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 90e94ffcc7b..2ffd62f1b53 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -445,6 +445,7 @@ add_library( src/strings/search/find_multiple.cu src/strings/split/partition.cu src/strings/split/split.cu + src/strings/split/split_re.cu src/strings/split/split_record.cu src/strings/strings_column_factories.cu src/strings/strings_column_view.cpp @@ -497,6 +498,8 @@ set_target_properties( # set target compile options CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON + # For std:: support of __int128_t. 
Can be removed once using cuda::std + CXX_EXTENSIONS ON CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 0704180bad0..11eef015364 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -59,6 +59,12 @@ function(ConfigureBench CMAKE_BENCH_NAME) ${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + # For std:: support of __int128_t. Can be removed once using cuda::std + CXX_EXTENSIONS ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) target_link_libraries( ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main @@ -254,7 +260,7 @@ ConfigureBench( string/convert_durations.cpp string/convert_fixed_point.cpp string/convert_numerics.cpp - string/copy.cpp + string/copy.cu string/extract.cpp string/factory.cu string/filter.cpp @@ -265,7 +271,7 @@ ConfigureBench( string/split.cpp string/substring.cpp string/translate.cpp - string/url_decode.cpp + string/url_decode.cu ) # ################################################################################################## diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index 745d4e354e7..f8226c7387a 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,8 +27,8 @@ class COMPILED_BINARYOP : public cudf::benchmark { }; -template -void BM_compiled_binaryop(benchmark::State& state) +template +void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) { const cudf::size_type column_size{(cudf::size_type)state.range(0)}; @@ -50,21 +50,26 @@ void BM_compiled_binaryop(benchmark::State& state) } // TODO tparam boolean for null. -#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ - TEMPLATED_BENCHMARK_F(COMPILED_BINARYOP, \ - BM_compiled_binaryop, \ - TypeLhs, \ - TypeRhs, \ - TypeOut, \ - cudf::binary_operator::binop) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ +#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ + BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name) \ + (::benchmark::State & st) \ + { \ + BM_compiled_binaryop(st, cudf::binary_operator::bop); \ + } \ + BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ + ->Unit(benchmark::kMicrosecond) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ ->Arg(100000000); /* 100M */ +#define build_name(a, b, c, d) a##_##b##_##c##_##d + +#define BINARYOP_BENCHMARK_DEFINE(lhs, rhs, bop, tout) \ + BM_BINARYOP_BENCHMARK_DEFINE(build_name(bop, lhs, rhs, tout), lhs, rhs, bop, tout) + using namespace cudf; using namespace numeric; diff --git a/cpp/benchmarks/join/join.cu b/cpp/benchmarks/join/join.cu index 55a1e524479..f21356aff02 100644 --- a/cpp/benchmarks/join/join.cu +++ b/cpp/benchmarks/join/join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,7 +49,7 @@ void nvbench_inner_join(nvbench::state& state, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) { cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); - return hj_obj.inner_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + return hj_obj.inner_join(right_input.select(right_on), std::nullopt, stream); }; BM_join(state, join); @@ -71,7 +71,7 @@ void nvbench_left_join(nvbench::state& state, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) { cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); - return hj_obj.left_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + return hj_obj.left_join(right_input.select(right_on), std::nullopt, stream); }; BM_join(state, join); @@ -93,7 +93,7 @@ void nvbench_full_join(nvbench::state& state, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) { cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); - return hj_obj.full_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + return hj_obj.full_join(right_input.select(right_on), std::nullopt, stream); }; BM_join(state, join); diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index e88253395d8..f2b9cb1bdb9 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
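[Editor's aside before the next join_common.hpp hunk: it replaces the host-side random null mask with a device-side `null75_generator` functor. The trick is that the two low bits of a uniform random draw are both zero with probability 1/4, so about 25% of rows are valid and roughly 75% are null. A quick standalone check of that arithmetic; plain Python stand-in for the thrust functor, not the actual benchmark code.]

```python
import random

def is_valid(rng):
    # Mirrors null75_generator: the two low bits of a uniform draw are 00
    # with probability 1/4, so ~25% of rows are valid (~75% null).
    return (rng.getrandbits(32) & 3) == 0

rng = random.Random(0)
n = 1_000_000
null_fraction = 1 - sum(is_valid(rng) for _ in range(n)) / n
print(f"{null_fraction:.3f}")  # ~0.750
```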
@@ -16,24 +16,41 @@ #pragma once -#include -#include +#include "generate_input_tables.cuh" -#include +#include +#include + +#include +#include #include +#include +#include #include +#include #include #include -#include -#include -#include -#include +#include + +#include +#include +#include #include -#include "generate_input_tables.cuh" +struct null75_generator { + thrust::minstd_rand engine; + thrust::uniform_int_distribution rand_gen; + null75_generator() : engine(), rand_gen() {} + __device__ bool operator()(size_t i) + { + engine.discard(i); + // roughly 75% nulls + return (rand_gen(engine) & 3) == 0; + } +}; template rand_gen(0, build_table_size); - auto build_random_null_mask = [&rand_gen](int size) { + auto build_random_null_mask = [](int size) { // roughly 75% nulls - auto validity = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); - return cudf::test::detail::make_null_mask(validity, validity + size); + auto validity = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), null75_generator{}); + return cudf::detail::valid_if(validity, validity + size, thrust::identity{}).first; }; std::unique_ptr build_key_column = [&]() { @@ -96,17 +111,14 @@ static void BM_join(state_type& state, Join JoinFunc) selectivity, multiplicity); - auto payload_data_it = thrust::make_counting_iterator(0); - cudf::test::fixed_width_column_wrapper build_payload_column( - payload_data_it, payload_data_it + build_table_size); - - cudf::test::fixed_width_column_wrapper probe_payload_column( - payload_data_it, payload_data_it + probe_table_size); + auto init = cudf::make_fixed_width_scalar(static_cast(0)); + auto build_payload_column = cudf::sequence(build_table_size, *init); + auto probe_payload_column = cudf::sequence(probe_table_size, *init); CHECK_CUDA(0); - cudf::table_view build_table({build_key_column->view(), build_payload_column}); - cudf::table_view probe_table({probe_key_column->view(), probe_payload_column}); + cudf::table_view build_table({build_key_column->view(), *build_payload_column}); + cudf::table_view probe_table({probe_key_column->view(), *probe_payload_column}); // Setup join parameters and result table [[maybe_unused]] std::vector columns_to_join = {0}; diff --git a/cpp/benchmarks/string/copy.cpp b/cpp/benchmarks/string/copy.cu similarity index 83% rename from cpp/benchmarks/string/copy.cpp rename to cpp/benchmarks/string/copy.cu index d40b0e069bc..2f064e71c44 100644 --- a/cpp/benchmarks/string/copy.cpp +++ b/cpp/benchmarks/string/copy.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,8 @@ * limitations under the License. 
*/ -#include +#include "string_bench_args.hpp" + #include #include #include @@ -23,10 +24,7 @@ #include #include -#include -#include - -#include "string_bench_args.hpp" +#include class StringCopy : public cudf::benchmark { }; @@ -47,11 +45,14 @@ static void BM_copy(benchmark::State& state, copy_type ct) create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); // scatter indices - std::vector host_map_data(n_rows); - std::iota(host_map_data.begin(), host_map_data.end(), 0); - std::random_shuffle(host_map_data.begin(), host_map_data.end()); - cudf::test::fixed_width_column_wrapper index_map(host_map_data.begin(), - host_map_data.end()); + auto index_map_col = make_numeric_column( + cudf::data_type{cudf::type_id::INT32}, n_rows, cudf::mask_state::UNALLOCATED); + auto index_map = index_map_col->mutable_view(); + thrust::shuffle_copy(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(n_rows), + index_map.begin(), + thrust::default_random_engine()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); diff --git a/cpp/benchmarks/string/url_decode.cpp b/cpp/benchmarks/string/url_decode.cpp deleted file mode 100644 index 4dc77cffa1a..00000000000 --- a/cpp/benchmarks/string/url_decode.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include - -struct url_string_generator { - size_t num_chars; - std::bernoulli_distribution dist; - - url_string_generator(size_t num_chars, double esc_seq_chance) - : num_chars{num_chars}, dist{esc_seq_chance} - { - } - - std::string operator()(std::mt19937& engine) - { - std::string str; - str.reserve(num_chars); - while (str.size() < num_chars) { - if (str.size() < num_chars - 3 && dist(engine)) { - str += "%20"; - } else { - str.push_back('a'); - } - } - return str; - } -}; - -cudf::test::strings_column_wrapper generate_column(cudf::size_type num_rows, - cudf::size_type chars_per_row, - double esc_seq_chance) -{ - std::mt19937 engine(1); - url_string_generator url_gen(chars_per_row, esc_seq_chance); - std::vector strings; - strings.reserve(num_rows); - std::generate_n(std::back_inserter(strings), num_rows, [&]() { return url_gen(engine); }); - return cudf::test::strings_column_wrapper(strings.begin(), strings.end()); -} - -class UrlDecode : public cudf::benchmark { -}; - -template -void BM_url_decode(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const chars_per_row = state.range(1); - - auto column = generate_column(num_rows, chars_per_row, esc_seq_pct / 100.0); - auto strings_view = cudf::strings_column_view(column); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto result = cudf::strings::url_decode(strings_view); - } - - state.SetBytesProcessed(state.iterations() * num_rows * - (chars_per_row + sizeof(cudf::size_type))); -} - -#define URLD_BENCHMARK_DEFINE(esc_seq_pct) \ - TEMPLATED_BENCHMARK_F(UrlDecode, BM_url_decode, esc_seq_pct) \ - ->Args({100000000, 10}) \ - ->Args({10000000, 100}) \ - ->Args({1000000, 1000}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -URLD_BENCHMARK_DEFINE(10) -URLD_BENCHMARK_DEFINE(50) diff --git a/cpp/benchmarks/string/url_decode.cu b/cpp/benchmarks/string/url_decode.cu new file mode 100644 index 00000000000..c460820d788 --- /dev/null +++ b/cpp/benchmarks/string/url_decode.cu @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +struct url_string_generator { + char* chars; + double esc_seq_chance; + thrust::minstd_rand engine; + thrust::uniform_real_distribution esc_seq_dist; + url_string_generator(char* c, double esc_seq_chance, thrust::minstd_rand& engine) + : chars(c), esc_seq_chance(esc_seq_chance), engine(engine), esc_seq_dist(0, 1) + { + } + + __device__ void operator()(thrust::tuple str_begin_end) + { + auto begin = thrust::get<0>(str_begin_end); + auto end = thrust::get<1>(str_begin_end); + engine.discard(begin); + for (auto i = begin; i < end; ++i) { + if (esc_seq_dist(engine) < esc_seq_chance and i < end - 3) { + chars[i] = '%'; + chars[i + 1] = '2'; + chars[i + 2] = '0'; + i += 2; + } else { + chars[i] = 'a'; + } + } + } +}; + +auto generate_column(cudf::size_type num_rows, cudf::size_type chars_per_row, double esc_seq_chance) +{ + std::vector strings{std::string(chars_per_row, 'a')}; + auto col_1a = cudf::test::strings_column_wrapper(strings.begin(), strings.end()); + auto table_a = cudf::repeat(cudf::table_view{{col_1a}}, num_rows); + auto result_col = std::move(table_a->release()[0]); // string column with num_rows aaa... + auto chars_col = result_col->child(cudf::strings_column_view::chars_column_index).mutable_view(); + auto offset_col = result_col->child(cudf::strings_column_view::offsets_column_index).view(); + + auto engine = thrust::default_random_engine{}; + thrust::for_each_n(thrust::device, + thrust::make_zip_iterator(offset_col.begin(), + offset_col.begin() + 1), + num_rows, + url_string_generator{chars_col.begin(), esc_seq_chance, engine}); + return result_col; +} + +class UrlDecode : public cudf::benchmark { +}; + +void BM_url_decode(benchmark::State& state, int esc_seq_pct) +{ + cudf::size_type const num_rows = state.range(0); + cudf::size_type const chars_per_row = state.range(1); + + auto column = generate_column(num_rows, chars_per_row, esc_seq_pct / 100.0); + auto strings_view = cudf::strings_column_view(column->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + auto result = cudf::strings::url_decode(strings_view); + } + + state.SetBytesProcessed(state.iterations() * num_rows * + (chars_per_row + sizeof(cudf::size_type))); +} + +#define URLD_BENCHMARK_DEFINE(esc_seq_pct) \ + BENCHMARK_DEFINE_F(UrlDecode, esc_seq_pct) \ + (::benchmark::State & st) { BM_url_decode(st, esc_seq_pct); } \ + BENCHMARK_REGISTER_F(UrlDecode, esc_seq_pct) \ + ->Args({100000000, 10}) \ + ->Args({10000000, 100}) \ + ->Args({1000000, 1000}) \ + ->Unit(benchmark::kMillisecond) \ + ->UseManualTime(); + +URLD_BENCHMARK_DEFINE(10) +URLD_BENCHMARK_DEFINE(50) diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 90097889a86..ca19e3046ad 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,15 +14,16 @@ * limitations under the License. 
*/ -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" +#include +#include #include #include #include -#include #include +#include +#include #include #include @@ -170,21 +171,18 @@ void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_threa template void type_dispatcher_benchmark(::benchmark::State& state) { - const auto source_size = static_cast(state.range(1)); - - const auto n_cols = static_cast(state.range(0)); - + const auto n_cols = static_cast(state.range(0)); + const auto source_size = static_cast(state.range(1)); const auto work_per_thread = static_cast(state.range(2)); - auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto init = cudf::make_fixed_width_scalar(static_cast(0)); - std::vector> source_column_wrappers; + std::vector> source_column_wrappers; std::vector source_columns; for (int i = 0; i < n_cols; ++i) { - source_column_wrappers.push_back( - cudf::test::fixed_width_column_wrapper(data, data + source_size)); - source_columns.push_back(source_column_wrappers[i]); + source_column_wrappers.push_back(cudf::sequence(source_size, *init)); + source_columns.push_back(*source_column_wrappers[i]); } cudf::mutable_table_view source_table{source_columns}; diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index ae1448da502..83c5e4c3e8f 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -308,7 +308,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endfunction() -set(CUDF_VERSION_Arrow 5.0.0) +set(CUDF_VERSION_Arrow 6.0.1) find_and_configure_arrow( ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC} diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 23587f49334..fb5b968671f 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -79,6 +79,7 @@ class aggregation { ROW_NUMBER, ///< get row-number of current index (relative to rolling window) RANK, ///< get rank of current index DENSE_RANK, ///< get dense rank of current index + PERCENT_RANK, ///< get percent (i.e. fractional) rank of current index COLLECT_LIST, ///< collect values into a list COLLECT_SET, ///< collect values into a list without duplicate entries LEAD, ///< window function, accesses row at specified offset following current row @@ -305,34 +306,34 @@ std::unique_ptr make_row_number_aggregation(); * 3. `RANK` aggregations are not compatible with exclusive scans. * * @code{.pseudo} - * Example: Consider an motor-racing statistics dataset, containing the following columns: - * 1. driver_name: (STRING) Name of the car driver - * 2. num_overtakes: (INT32) Number of times the driver overtook another car in a lap - * 3. 
lap_number: (INT32) The number of the lap + * Example: Consider a motor-racing statistics dataset, containing the following columns: + * 1. venue: (STRING) Location of the race event + * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) + * 3. time: (INT32) Time taken to complete the circuit * * For the following presorted data: * - * [ // driver_name, num_overtakes, lap_number - * { "bottas", 2, 3 }, - * { "bottas", 2, 7 }, - * { "bottas", 2, 7 }, - * { "bottas", 1, 1 }, - * { "bottas", 1, 2 }, - * { "hamilton", 4, 1 }, - * { "hamilton", 4, 1 }, - * { "hamilton", 3, 4 }, - * { "hamilton", 2, 4 } + * [ // venue, driver, time + * { "silverstone", "HAM" ("hamilton"), 15823}, + * { "silverstone", "LEC" ("leclerc"), 15827}, + * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "RIC" ("ricciardo"), 15905}, + * { "monza", "RIC" ("ricciardo"), 12154}, + * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. + * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. + * { "monza", "LEC" ("leclerc"), 12201}, + * { "monza", "PER" ("perez"), 12203} * ] * * A grouped rank aggregation scan with: - * groupby column : driver_name - * input orderby column: struct_column{num_overtakes, lap_number} - * result: column{1, 2, 2, 4, 5, 1, 1, 3, 4} - * - * A grouped rank aggregation scan with: - * groupby column : driver_name - * input orderby column: num_overtakes - * result: column{1, 1, 1, 4, 4, 1, 1, 3, 4} + * groupby column : venue + * input orderby column: time + * Produces the following rank column: + * { 1, 2, 3, 3, 5, 1, 2, 2, 4, 5} + * (This corresponds to the following grouping and `driver` rows:) + * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } + * <----------silverstone----------->|<-------------monza--------------> * @endcode */ template @@ -357,39 +358,95 @@ std::unique_ptr make_rank_aggregation(); * 3. `DENSE_RANK` aggregations are not compatible with exclusive scans. * * @code{.pseudo} - * Example: Consider an motor-racing statistics dataset, containing the following columns: - * 1. driver_name: (STRING) Name of the car driver - * 2. num_overtakes: (INT32) Number of times the driver overtook another car in a lap - * 3. lap_number: (INT32) The number of the lap + * Example: Consider a motor-racing statistics dataset, containing the following columns: + * 1. venue: (STRING) Location of the race event + * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) + * 3. time: (INT32) Time taken to complete the circuit * * For the following presorted data: * - * [ // driver_name, num_overtakes, lap_number - * { "bottas", 2, 3 }, - * { "bottas", 2, 7 }, - * { "bottas", 2, 7 }, - * { "bottas", 1, 1 }, - * { "bottas", 1, 2 }, - * { "hamilton", 4, 1 }, - * { "hamilton", 4, 1 }, - * { "hamilton", 3, 4 }, - * { "hamilton", 2, 4 } + * [ // venue, driver, time + * { "silverstone", "HAM" ("hamilton"), 15823}, + * { "silverstone", "LEC" ("leclerc"), 15827}, + * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "RIC" ("ricciardo"), 15905}, + * { "monza", "RIC" ("ricciardo"), 12154}, + * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. + * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. 
+ * { "monza", "LEC" ("leclerc"), 12201}, + * { "monza", "PER" ("perez"), 12203} * ] * * A grouped dense rank aggregation scan with: - * groupby column : driver_name - * input orderby column: struct_column{num_overtakes, lap_number} - * result: column{1, 2, 2, 3, 4, 1, 1, 2, 3} - * - * A grouped dense rank aggregation scan with: - * groupby column : driver_name - * input orderby column: num_overtakes - * result: column{1, 1, 1, 2, 2, 1, 1, 2, 3} + * groupby column : venue + * input orderby column: time + * Produces the following dense rank column: + * { 1, 2, 3, 3, 4, 1, 2, 2, 3, 4} + * (This corresponds to the following grouping and `driver` rows:) + * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } + * <----------silverstone----------->|<-------------monza--------------> * @endcode */ template std::unique_ptr make_dense_rank_aggregation(); +/** + * @brief Factory to create a PERCENT_RANK aggregation + * + * `PERCENT_RANK` returns a non-nullable column of double precision "fractional" ranks. + * For row index `i`, the percent rank of row `i` is defined as: + * percent_rank = (rank - 1) / (group_row_count - 1) + * where, + * 1. rank is the `RANK` of the row within the group + * 2. group_row_count is the number of rows in the group + * + * This aggregation only works with "scan" algorithms. The input to the grouped or + * ungrouped scan is an orderby column that orders the rows that the aggregate function ranks. + * If rows are ordered by more than one column, the orderby input column should be a struct + * column containing the ordering columns. + * + * Note: + * 1. This method requires that the rows are presorted by the group keys and order_by columns. + * 2. `PERCENT_RANK` aggregations will return a fully valid column regardless of null_handling + * policy specified in the scan. + * 3. `PERCENT_RANK` aggregations are not compatible with exclusive scans. + * + * @code{.pseudo} + * Example: Consider a motor-racing statistics dataset, containing the following columns: + * 1. venue: (STRING) Location of the race event + * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) + * 3. time: (INT32) Time taken to complete the circuit + * + * For the following presorted data: + * + * [ // venue, driver, time + * { "silverstone", "HAM" ("hamilton"), 15823}, + * { "silverstone", "LEC" ("leclerc"), 15827}, + * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "RIC" ("ricciardo"), 15905}, + * { "monza", "RIC" ("ricciardo"), 12154}, + * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. + * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. 
+ * { "monza", "LEC" ("leclerc"), 12201}, + * { "monza", "PER" ("perez"), 12203} + * ] + * + * A grouped percent rank aggregation scan with: + * groupby column : venue + * input orderby column: time + * Produces the following percent rank column: + * { 0.00, 0.25, 0.50, 0.50, 1.00, 0.00, 0.25, 0.25, 0.75, 1.00 } + * + * (This corresponds to the following grouping and `driver` rows:) + * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } + * <----------silverstone----------->|<-------------monza--------------> + * @endcode + */ +template +std::unique_ptr make_percent_rank_aggregation(); + /** * @brief Factory to create a COLLECT_LIST aggregation * diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 325f023f283..ba15e37f9ea 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,13 @@ #pragma once #include +#include +#include #include +#include +#include +#include #include /** @@ -375,6 +380,43 @@ class column_view : public detail::column_view_base { */ auto child_end() const noexcept { return _children.cend(); } + /** + * @brief Construct a column view from a device_span. + * + * Only numeric and chrono types are supported. + * + * @tparam T The device span type. Must be const and match the column view's type. + * @param data A typed device span containing the column view's data. + */ + template () or cudf::is_chrono())> + column_view(device_span data) + : column_view( + cudf::data_type{cudf::type_to_id()}, data.size(), data.data(), nullptr, 0, 0, {}) + { + CUDF_EXPECTS(data.size() < std::numeric_limits::max(), + "Data exceeds the maximum size of a column view."); + } + + /** + * @brief Converts a column view into a device span. + * + * Only numeric and chrono data types are supported. The column view must not + * be nullable. + * + * @tparam T The device span type. Must be const and match the column view's type. + * @throws cudf::logic_error if the column view type does not match the span type. + * @throws cudf::logic_error if the column view is nullable. + * @return A typed device span of the column view's data. + */ + template () or cudf::is_chrono())> + [[nodiscard]] operator device_span() const + { + CUDF_EXPECTS(type() == cudf::data_type{cudf::type_to_id()}, + "Device span type must match column view type."); + CUDF_EXPECTS(!nullable(), "A nullable column view cannot be converted to a device span."); + return device_span(data(), size()); + } + private: friend column_view bit_cast(column_view const& input, data_type type); diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index fbf315776f4..a234f4d3715 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -77,6 +77,8 @@ class simple_aggregations_collector { // Declares the interface for the simple class rank_aggregation const& agg); virtual std::vector> visit(data_type col_type, class dense_rank_aggregation const& agg); + virtual std::vector> visit( + data_type col_type, class percent_rank_aggregation const& agg); virtual std::vector> visit( data_type col_type, class collect_list_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -126,6 +128,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class row_number_aggregation const& agg); virtual void visit(class rank_aggregation const& agg); virtual void visit(class dense_rank_aggregation const& agg); + virtual void visit(class percent_rank_aggregation const& agg); virtual void visit(class collect_list_aggregation const& agg); virtual void visit(class collect_set_aggregation const& agg); virtual void visit(class lead_lag_aggregation const& agg); @@ -654,6 +657,22 @@ class dense_rank_aggregation final : public rolling_aggregation, public groupby_ void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +class percent_rank_aggregation final : public rolling_aggregation, public groupby_scan_aggregation { + public: + percent_rank_aggregation() : aggregation{PERCENT_RANK} {} + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived aggregation class for specifying COLLECT_LIST aggregation */ @@ -1243,6 +1262,12 @@ struct target_type_impl { using type = size_type; }; +// Always use double for PERCENT_RANK +template +struct target_type_impl { + using type = double; +}; + // Always use list for COLLECT_LIST template struct target_type_impl { @@ -1405,6 +1430,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind return f.template operator()(std::forward(args)...); case aggregation::DENSE_RANK: return f.template operator()(std::forward(args)...); + case aggregation::PERCENT_RANK: + return f.template operator()(std::forward(args)...); case aggregation::COLLECT_LIST: return f.template operator()(std::forward(args)...); case aggregation::COLLECT_SET: diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index 8e3db1c7b10..36dce6caf0b 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -102,5 +102,17 @@ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Generate row percent ranks for a column. + * + * @param order_by Input column to generate ranks for. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return rank values. 
+ */ +std::unique_ptr inclusive_percent_rank_scan(column_view const& order_by, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index a59ad4c42ee..d31f85d6603 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -129,6 +129,8 @@ struct DeviceMin { !cudf::is_fixed_point()>* = nullptr> static constexpr T identity() { + // chrono types do not have std::numeric_limits specializations and should use T::max() + // https://eel.is/c++draft/numeric.limits.general#6 if constexpr (cudf::is_chrono()) return T::max(); return cuda::std::numeric_limits::max(); } @@ -171,6 +173,8 @@ struct DeviceMax { !cudf::is_fixed_point()>* = nullptr> static constexpr T identity() { + // chrono types do not have std::numeric_limits specializations and should use T::min() + // https://eel.is/c++draft/numeric.limits.general#6 if constexpr (cudf::is_chrono()) return T::min(); return cuda::std::numeric_limits::lowest(); } diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 51d58383de4..4ec5f134bb0 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -130,7 +131,7 @@ struct MurmurHash3_32 { * * @returns A hash value that intelligently combines the lhs and rhs hash values */ - [[nodiscard]] __device__ inline result_type hash_combine(result_type lhs, result_type rhs) + constexpr result_type hash_combine(result_type lhs, result_type rhs) const { result_type combined{lhs}; diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index cce05042917..e4e94074fb8 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ #include #include -#include namespace cudf { /** @@ -26,6 +25,22 @@ namespace cudf { * @file */ +/** + * @brief Identifies the hash function to be used + */ +enum class hash_id { + HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed + HASH_MURMUR3, ///< Murmur3 hash function + HASH_MD5, ///< MD5 hash function + HASH_SERIAL_MURMUR3, ///< Serial Murmur3 hash function + HASH_SPARK_MURMUR3 ///< Spark Murmur3 hash function +}; + +/** + * @brief The default seed value for hash functions + */ +static constexpr uint32_t DEFAULT_HASH_SEED = 0; + /** * @brief Computes the hash value of each row in the input set of columns. 
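The hash_combine body above is cut off mid-hunk. For reference, here is a self-contained sketch of the Boost-style combine it is modeled on; treat the exact constant and shift recipe as an assumption about the implementation rather than a verbatim copy of the libcudf body.

#include <cstdint>

// Boost-style hash_combine: mixes rhs into lhs so that (a, b) and (b, a)
// hash differently. 0x9e3779b9 is the 32-bit golden-ratio constant.
constexpr std::uint32_t hash_combine(std::uint32_t lhs, std::uint32_t rhs)
{
  std::uint32_t combined{lhs};
  combined ^= rhs + 0x9e3779b9u + (combined << 6) + (combined >> 2);
  return combined;
}

static_assert(hash_combine(1u, 2u) != hash_combine(2u, 1u), "combine is order-sensitive");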
* diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index f6efea5f2bb..d56f8f0e904 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -530,7 +530,6 @@ class hash_join { * provided `output_size` is smaller than the actual output size. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param output_size Optional value which allows users to specify the exact output size. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device @@ -543,7 +542,6 @@ class hash_join { std::pair>, std::unique_ptr>> inner_join(cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, std::optional output_size = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; @@ -554,7 +552,6 @@ class hash_join { * provided `output_size` is smaller than the actual output size. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param output_size Optional value which allows users to specify the exact output size. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device @@ -567,7 +564,6 @@ class hash_join { std::pair>, std::unique_ptr>> left_join(cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, std::optional output_size = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; @@ -578,7 +574,6 @@ class hash_join { * provided `output_size` is smaller than the actual output size. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param output_size Optional value which allows users to specify the exact output size. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device @@ -591,7 +586,6 @@ class hash_join { std::pair>, std::unique_ptr>> full_join(cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, std::optional output_size = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; @@ -601,39 +595,32 @@ class hash_join { * probe table. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches * * @return The exact number of output when performing an inner join between two tables with * `build` and `probe` as the the join keys . 
*/ [[nodiscard]] std::size_t inner_join_size( - cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + cudf::table_view const& probe, rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * Returns the exact number of matches (rows) when performing a left join with the specified probe * table. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches * * @return The exact number of output when performing a left join between two tables with `build` * and `probe` as the join keys. */ [[nodiscard]] std::size_t left_join_size( - cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + cudf::table_view const& probe, rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * Returns the exact number of matches (rows) when performing a full join with the specified probe * table. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the intermediate table and columns' device * memory. @@ -643,7 +630,6 @@ class hash_join { */ std::size_t full_join_size( cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index 6b1ad7db08b..3ffd9a87d39 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include +#include #include diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index 3f35b796e58..ba34e20398e 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -451,7 +451,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window( * should be the exact same type (`INT32`). * * @code{.pseudo} - * Example: Consider an motor-racing statistics dataset, containing the following columns: + * Example: Consider a motor-racing statistics dataset, containing the following columns: * 1. driver_name: (STRING) Name of the car driver * 2. num_overtakes: (INT32) Number of times the driver overtook another car in a lap * 3.
lap_number: (INT32) The number of the lap diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh index aa3f544202f..7af56f89449 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include @@ -150,9 +152,11 @@ __device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int exp_ten += exp_offset; // shift the output value based on the exp_ten and the scale values - value = exp_ten < scale - ? value / static_cast(exp10(static_cast(scale - exp_ten))) - : value * static_cast(exp10(static_cast(exp_ten - scale))); + auto const shift_adjust = + abs(scale - exp_ten) > cuda::std::numeric_limits::digits10 + ? cuda::std::numeric_limits::max() + : numeric::detail::exp10(abs(scale - exp_ten)); + value = exp_ten < scale ? value / shift_adjust : value * shift_adjust; return static_cast(value) * (sign == 0 ? 1 : sign); } diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index 466f71aace0..94e9f36d7d3 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -48,12 +49,14 @@ namespace strings { * * @param strings Strings instance for this operation. * @param pattern The regular expression pattern with group indicators. + * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate the returned table's device memory. * @return Columns of strings extracted from the input column. */ std::unique_ptr extract( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -69,7 +72,7 @@ std::unique_ptr
extract( * @code{.pseudo} * Example: * s = ["a1 b4", "b2", "c3 a5", "b", null] - * r = extract_all(s,"([ab])(\\d)") + * r = extract_all_record(s,"([ab])(\\d)") * r is now [ ["a", "1", "b", "4"], * ["b", "2"], * ["a", "5"], @@ -81,12 +84,14 @@ std::unique_ptr<table>
extract( * * @param strings Strings instance for this operation. * @param pattern The regular expression pattern with group indicators. + * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate any returned device memory. * @return Lists column containing strings extracted from the input column. */ -std::unique_ptr extract_all( +std::unique_ptr extract_all_record( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp new file mode 100644 index 00000000000..320d1bdc9b4 --- /dev/null +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cudf { +namespace strings { +/** + * @addtogroup strings_split + * @{ + * @file + */ + +/** + * @brief Splits strings elements into a table of strings columns + * using a regex pattern to delimit each string. + * + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of strings[row]` + * where `token` is a substring between delimiters. + * + * The number of rows in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = split_re(s, "[_ ]") + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * s2 = split_re(s, "[ _]", 1) + * s2 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc def_g", "_bc", "ab cd", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return A table of columns of strings. + */ +std::unique_ptr
split_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Splits strings elements into a table of strings columns + * using a regex pattern to delimit each string starting from the end of the string. + * + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of string[row]` + * where `token` is the substring between each delimiter. + * + * The number of rows in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = rsplit_re(s, "[_ ]") + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * s2 = rsplit_re(s, "[ _]", 1) + * s2 is a table of strings columns: + * [ ["a_bc def", "a_", "_ab", "ab"], + * ["g", "bc", "cd", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return A table of columns of strings. + */ +std::unique_ptr
rsplit_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Splits strings elements into a list column of strings + * using the given regex pattern to delimit each string. + * + * Each element generates an array of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null output row. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = split_record_re(s, "[_ ]") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = split_record_re(s, "[ _]", 1) + * s2 is a lists column of strings: + * [ ["a", "bc def_g"], + * ["a", "_bc"], + * ["", "ab cd"], + * ["ab", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return Lists column of strings. + */ +std::unique_ptr split_record_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Splits strings elements into a list column of strings + * using the given regex pattern to delimit each string starting from the end of the string. + * + * Each element generates a vector of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the separation points within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null output row. 
+ * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = rsplit_record_re(s, "[_ ]") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = rsplit_record_re(s, "[ _]", 1) + * s2 is a lists column of strings: + * [ ["a_bc def", "g"], + * ["a_", "bc"], + * ["_ab", "cd"], + * ["ab_cd", ""] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return Lists column of strings. + */ +std::unique_ptr rsplit_record_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 6222b2e680e..76e2589a5a9 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -326,21 +326,5 @@ inline bool operator!=(data_type const& lhs, data_type const& rhs) { return !(lh */ std::size_t size_of(data_type t); -/** - * @brief Identifies the hash function to be used - */ -enum class hash_id { - HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed - HASH_MURMUR3, ///< Murmur3 hash function - HASH_MD5, ///< MD5 hash function - HASH_SERIAL_MURMUR3, ///< Serial Murmur3 hash function - HASH_SPARK_MURMUR3 ///< Spark Murmur3 hash function -}; - -/** - * @brief The default seed value for hash functions - */ -static constexpr uint32_t DEFAULT_HASH_SEED = 0; - /** @} */ } // namespace cudf diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 0ac41b2c4a1..bfaf4694693 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/wrappers/durations.hpp b/cpp/include/cudf/wrappers/durations.hpp index 3a27d798487..62aa22c2788 100644 --- a/cpp/include/cudf/wrappers/durations.hpp +++ b/cpp/include/cudf/wrappers/durations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
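Assuming a strings column view built elsewhere, typical calls to the new regex-split APIs declared above would look like the sketch below. The signatures are taken from the declarations in this header; the wrapper function and its argument are illustrative only.

#include <cudf/strings/split/split_re.hpp>
#include <cudf/strings/strings_column_view.hpp>

// Sketch: split each string on '_' or ' ' at most once, from the left and
// from the right, plus a full split into a table of columns.
void split_examples(cudf::strings_column_view const& scv)
{
  auto left_split  = cudf::strings::split_record_re(scv, "[_ ]", 1);   // lists column
  auto right_split = cudf::strings::rsplit_record_re(scv, "[_ ]", 1);  // lists column
  auto table_split = cudf::strings::split_re(scv, "[_ ]");             // table of string columns
}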
@@ -16,8 +16,6 @@ #pragma once -#include - #include namespace cudf { @@ -68,32 +66,3 @@ static_assert(sizeof(duration_ns) == sizeof(typename duration_ns::rep), ""); /** @} */ // end of group } // namespace cudf - -namespace std { -/** - * @brief Specialization of std::numeric_limits for cudf::detail::duration - * - * Pass through to return the limits of the underlying numeric representation. - */ -#define DURATION_LIMITS(TypeName) \ - template <> \ - struct numeric_limits { \ - static constexpr TypeName max() noexcept { return TypeName::max(); } \ - static constexpr TypeName lowest() noexcept \ - { \ - return TypeName(std::numeric_limits::lowest()); \ - } \ - static constexpr TypeName min() noexcept { return TypeName::min(); } \ - } - -DURATION_LIMITS(cudf::duration_D); -DURATION_LIMITS(cudf::duration_h); -DURATION_LIMITS(cudf::duration_m); -DURATION_LIMITS(cudf::duration_s); -DURATION_LIMITS(cudf::duration_ms); -DURATION_LIMITS(cudf::duration_us); -DURATION_LIMITS(cudf::duration_ns); - -#undef DURATION_LIMITS - -} // namespace std diff --git a/cpp/include/cudf/wrappers/timestamps.hpp b/cpp/include/cudf/wrappers/timestamps.hpp index 8481068ca05..5a4424112de 100644 --- a/cpp/include/cudf/wrappers/timestamps.hpp +++ b/cpp/include/cudf/wrappers/timestamps.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,32 +78,3 @@ static_assert(sizeof(timestamp_ns) == sizeof(typename timestamp_ns::rep), ""); /** @} */ // end of group } // namespace cudf - -namespace std { -/** - * @brief Specialization of std::numeric_limits for cudf::detail::timestamp - * - * Pass through to return the limits of the underlying numeric representation. - */ -#define TIMESTAMP_LIMITS(TypeName) \ - template <> \ - struct numeric_limits { \ - static constexpr TypeName max() noexcept { return TypeName::max(); } \ - static constexpr TypeName lowest() noexcept \ - { \ - return TypeName{TypeName::duration{std::numeric_limits::lowest()}}; \ - } \ - static constexpr TypeName min() noexcept { return TypeName::min(); } \ - } - -TIMESTAMP_LIMITS(cudf::timestamp_D); -TIMESTAMP_LIMITS(cudf::timestamp_h); -TIMESTAMP_LIMITS(cudf::timestamp_m); -TIMESTAMP_LIMITS(cudf::timestamp_s); -TIMESTAMP_LIMITS(cudf::timestamp_ms); -TIMESTAMP_LIMITS(cudf::timestamp_us); -TIMESTAMP_LIMITS(cudf::timestamp_ns); - -#undef TIMESTAMP_LIMITS - -} // namespace std diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index c00b3d6db85..8e2a167f7b2 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
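The numeric_limits specializations deleted above are no longer needed because chrono types already expose static min()/max() members, while the unspecialized std::numeric_limits primary template reports is_specialized == false, which is exactly what the DeviceMin/DeviceMax changes earlier in this diff rely on. A standalone standard-C++ sketch of the relationship:

#include <chrono>
#include <limits>

using ms = std::chrono::milliseconds;

// The primary std::numeric_limits template is not specialized for durations...
static_assert(!std::numeric_limits<ms>::is_specialized, "no numeric_limits for chrono types");
// ...but the chrono type's own static members pass through to the underlying rep.
static_assert(ms::max().count() == std::numeric_limits<ms::rep>::max(), "max passes through");
static_assert(ms::min().count() == std::numeric_limits<ms::rep>::lowest(), "min passes through");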
@@ -160,6 +160,12 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, percent_rank_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + std::vector> simple_aggregations_collector::visit( data_type col_type, collect_list_aggregation const& agg) { @@ -333,6 +339,11 @@ void aggregation_finalizer::visit(dense_rank_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(percent_rank_aggregation const& agg) +{ + visit(static_cast(agg)); +} + void aggregation_finalizer::visit(collect_list_aggregation const& agg) { visit(static_cast(agg)); @@ -616,6 +627,16 @@ template std::unique_ptr make_dense_rank_aggregation() template std::unique_ptr make_dense_rank_aggregation(); +/// Factory to create a PERCENT_RANK aggregation +template +std::unique_ptr make_percent_rank_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_percent_rank_aggregation(); +template std::unique_ptr +make_percent_rank_aggregation(); + /// Factory to create a COLLECT_LIST aggregation template std::unique_ptr make_collect_list_aggregation(null_policy null_handling) diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 62aa3df8e5c..eae7d0b6129 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ #include #include +#include +#include #include #include #include @@ -98,7 +100,9 @@ std::unique_ptr rank_scan(column_view const& order_by, order_by, group_labels, group_offsets, - [] __device__(bool equality, auto row_index) { return equality ? row_index + 1 : 0; }, + [] __device__(bool unequal, auto row_index_in_group) { + return unequal ? row_index_in_group + 1 : 0; + }, DeviceMax{}, has_nested_nulls(table_view{{order_by}}), stream, @@ -115,13 +119,50 @@ std::unique_ptr dense_rank_scan(column_view const& order_by, order_by, group_labels, group_offsets, - [] __device__(bool equality, auto row_index) { return equality; }, + [] __device__(bool const unequal, size_type const) { return unequal ? 1 : 0; }, DeviceSum{}, has_nested_nulls(table_view{{order_by}}), stream, mr); } +std::unique_ptr percent_rank_scan(column_view const& order_by, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const rank_column = rank_scan( + order_by, group_labels, group_offsets, stream, rmm::mr::get_current_device_resource()); + auto const rank_view = rank_column->view(); + auto const group_size_iter = cudf::detail::make_counting_transform_iterator( + 0, + [labels = group_labels.begin(), + offsets = group_offsets.begin()] __device__(size_type row_index) { + auto const group_label = labels[row_index]; + auto const group_start = offsets[group_label]; + auto const group_end = offsets[group_label + 1]; + return group_end - group_start; + }); + + // Result type for PERCENT_RANK is independent of input type. 
+ using result_type = cudf::detail::target_type_t; + + auto percent_rank_result = cudf::make_fixed_width_column( + data_type{type_to_id()}, rank_view.size(), mask_state::UNALLOCATED, stream, mr); + + thrust::transform(rmm::exec_policy(stream), + rank_view.begin(), + rank_view.end(), + group_size_iter, + percent_rank_result->mutable_view().begin(), + [] __device__(auto const rank, auto const group_size) { + return group_size == 1 ? 0.0 : ((rank - 1.0) / (group_size - 1)); + }); + + return percent_rank_result; +} + } // namespace detail } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp index 82ef0e25380..76a7f3f73c7 100644 --- a/cpp/src/groupby/sort/group_scan.hpp +++ b/cpp/src/groupby/sort/group_scan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,6 +116,21 @@ std::unique_ptr dense_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Internal API to calculate groupwise percent rank value + * + * @param order_by column or struct column by which the rows within a group are sorted + * @param group_labels ID of group to which the row belongs + * @param group_offsets group index offsets with group ID indices + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Column of type `double` of percent rank values + */ +std::unique_ptr percent_rank_scan(column_view const& order_by, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index dde4e00eb4a..8e1463f7964 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,12 +74,16 @@ struct element_arg_minmax_fn { /** * @brief Value accessor for column which supports dictionary column too. * + * This is similar to `value_accessor` in `column_device_view.cuh` but with support of dictionary + * type. + * * @tparam T Type of the underlying column. For dictionary column, type of the key column. */ template struct value_accessor { column_device_view const col; bool const is_dict; + value_accessor(column_device_view const& col) : col(col), is_dict(cudf::is_dictionary(col.type())) { } @@ -93,6 +97,7 @@ struct value_accessor { return col.element(i); } } + __device__ auto operator()(size_type i) const { return value(i); } }; @@ -100,20 +105,28 @@ struct value_accessor { * @brief Null replaced value accessor for column which supports dictionary column too. * For null value, returns null `init` value * - * @tparam T Type of the underlying column. For dictionary column, type of the key column. + * @tparam SourceType Type of the underlying column. 
For dictionary column, type of the key column. + * @tparam TargetType Type that is used for computation. */ -template -struct null_replaced_value_accessor : value_accessor { - using super_t = value_accessor; +template +struct null_replaced_value_accessor : value_accessor { + using super_t = value_accessor; + + TargetType const init; bool const has_nulls; - T const init; - null_replaced_value_accessor(column_device_view const& col, T const& init, bool const has_nulls) + + null_replaced_value_accessor(column_device_view const& col, + TargetType const& init, + bool const has_nulls) : super_t(col), init(init), has_nulls(has_nulls) { } - __device__ T operator()(size_type i) const + + __device__ TargetType operator()(size_type i) const { - return has_nulls && super_t::col.is_null_nocheck(i) ? init : super_t::value(i); + return has_nulls && super_t::col.is_null_nocheck(i) + ? init + : static_cast(super_t::value(i)); } }; @@ -168,7 +181,7 @@ struct group_reduction_functor; + using SourceDType = device_storage_type_t; using ResultType = cudf::detail::target_type_t; using ResultDType = device_storage_type_t; @@ -203,9 +216,11 @@ struct group_reduction_functor; - auto init = OpType::template identity(); + auto init = OpType::template identity(); auto inp_values = cudf::detail::make_counting_transform_iterator( - 0, null_replaced_value_accessor{*d_values_ptr, init, values.has_nulls()}); + 0, + null_replaced_value_accessor{ + *d_values_ptr, init, values.has_nulls()}); do_reduction(inp_values, result_begin, OpType{}); } diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 402ff8c47ed..8c4959da35b 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -144,6 +144,23 @@ void scan_result_functor::operator()(aggregation const& detail::dense_rank_scan( order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr)); } + +template <> +void scan_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(values, agg)) return; + CUDF_EXPECTS(helper.is_presorted(), + "Percent rank aggregate in groupby scan requires the keys to be presorted"); + auto const order_by = get_grouped_values(); + CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by), + "Unsupported list type in grouped percent_rank scan."); + + cache.add_result( + values, + agg, + detail::percent_rank_scan( + order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr)); +} } // namespace detail // Sort-based groupby diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index dabe992d959..e2893a2e881 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
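The SourceType/TargetType split in null_replaced_value_accessor above lets values be read as one type and accumulated as a wider one, e.g. INT32 inputs summed into an INT64 result without overflow. The following host-only sketch shows the same pattern with illustrative names, not libcudf's types.

#include <cstdint>
#include <iostream>

// Illustrative accessor: reads SourceType elements, substitutes `init` for
// null rows, and returns values widened to TargetType for accumulation.
template <typename SourceType, typename TargetType>
struct null_replaced_accessor {
  SourceType const* data;
  bool const* valid;
  TargetType init;  // identity of the reduction, used for null rows
  TargetType operator()(int i) const
  {
    return valid[i] ? static_cast<TargetType>(data[i]) : init;
  }
};

int main()
{
  std::int32_t data[]{2'000'000'000, 2'000'000'000, 7};
  bool valid[]{true, true, false};
  null_replaced_accessor<std::int32_t, std::int64_t> acc{data, valid, 0};
  std::int64_t sum = 0;
  for (int i = 0; i < 3; ++i) sum += acc(i);
  std::cout << sum << '\n';  // 4000000000: would overflow an int32 accumulator
}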
@@ -194,20 +194,13 @@ template > make_sliced_tasks( F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool) { + constexpr size_t default_max_slice_size = 4 * 1024 * 1024; + static auto const max_slice_size = getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_size); + auto const slices = make_file_io_slices(size, max_slice_size); std::vector> slice_tasks; - constexpr size_t default_max_slice_bytes = 4 * 1024 * 1024; - static auto const max_slice_bytes = - getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_bytes); - size_t const n_slices = util::div_rounding_up_safe(size, max_slice_bytes); - size_t slice_offset = 0; - for (size_t t = 0; t < n_slices; ++t) { - DataT* ptr_slice = ptr + slice_offset; - - size_t const slice_size = (t == n_slices - 1) ? size % max_slice_bytes : max_slice_bytes; - slice_tasks.push_back(pool.submit(function, ptr_slice, slice_size, offset + slice_offset)); - - slice_offset += slice_size; - } + std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) { + return pool.submit(function, ptr + slice.offset, slice.size, offset + slice.offset); + }); return slice_tasks; } @@ -318,6 +311,21 @@ std::unique_ptr make_cufile_output(std::string const& filepa return nullptr; } +std::vector make_file_io_slices(size_t size, size_t max_slice_size) +{ + max_slice_size = std::max(1024ul, max_slice_size); + auto const n_slices = util::div_rounding_up_safe(size, max_slice_size); + std::vector slices; + slices.reserve(n_slices); + std::generate_n(std::back_inserter(slices), n_slices, [&, idx = 0]() mutable { + auto const slice_offset = idx++ * max_slice_size; + auto const slice_size = std::min(size - slice_offset, max_slice_size); + return file_io_slice{slice_offset, slice_size}; + }); + + return slices; +} + } // namespace detail } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index fcee4e43a20..be3ecc49ab0 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -291,6 +291,21 @@ std::unique_ptr make_cufile_input(std::string const& filepath */ std::unique_ptr make_cufile_output(std::string const& filepath); +/** + * @brief Byte range to be read/written in a single operation. + */ +struct file_io_slice { + size_t offset; + size_t size; +}; + +/** + * @brief Split the total number of bytes to read/write into slices to enable parallel IO. + * + * If `max_slice_size` is below 1024, 1024 will be used instead to prevent potential misuse. + */ +std::vector make_file_io_slices(size_t size, size_t max_slice_size); + } // namespace detail } // namespace io } // namespace cudf diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 7590c93f0c3..b89bcabf23e 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
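A worked example of the slicing logic introduced above: with the default 4 MiB maximum slice size, a 10 MiB request becomes three slices, {0, 4 MiB}, {4 MiB, 4 MiB}, and {8 MiB, 2 MiB}. The sketch below restates make_file_io_slices outside libcudf so it compiles standalone; util::div_rounding_up_safe is replaced by plain ceiling division.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <vector>

struct file_io_slice {
  std::size_t offset;
  std::size_t size;
};

// Standalone restatement of the function in the diff above: clamp the slice
// size to at least 1024 bytes, then emit ceil(size / max_slice_size) slices.
std::vector<file_io_slice> make_file_io_slices(std::size_t size, std::size_t max_slice_size)
{
  max_slice_size      = std::max<std::size_t>(1024, max_slice_size);
  auto const n_slices = (size + max_slice_size - 1) / max_slice_size;
  std::vector<file_io_slice> slices;
  slices.reserve(n_slices);
  std::generate_n(std::back_inserter(slices), n_slices, [&, idx = std::size_t{0}]() mutable {
    auto const slice_offset = idx++ * max_slice_size;
    return file_io_slice{slice_offset, std::min(size - slice_offset, max_slice_size)};
  });
  return slices;
}

int main()
{
  for (auto const& s : make_file_io_slices(10 << 20, 4 << 20))
    std::cout << s.offset << " + " << s.size << '\n';  // 0+4M, 4M+4M, 8M+2M
}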
@@ -140,8 +140,8 @@ probe_join_hash_table(cudf::table_device_view build_table, std::size_t get_full_join_size(cudf::table_device_view build_table, cudf::table_device_view probe_table, multimap_type const& hash_table, - bool has_nulls, - null_equality compare_nulls, + bool const has_nulls, + null_equality const compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -235,6 +235,7 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) : _is_empty{build.num_rows() == 0}, + _nulls_equal{compare_nulls}, _hash_table{compute_hash_table_size(build.num_rows()), std::numeric_limits::max(), cudf::detail::JoinNoneValue, @@ -253,50 +254,43 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, if (_is_empty) { return; } - build_join_hash_table(_build, _hash_table, compare_nulls, stream); + cudf::detail::build_join_hash_table(_build, _hash_table, _nulls_equal, stream); } std::pair>, std::unique_ptr>> hash_join::hash_join_impl::inner_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, compare_nulls, output_size, stream, mr); + return compute_hash_join(probe, output_size, stream, mr); } std::pair>, std::unique_ptr>> hash_join::hash_join_impl::left_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, compare_nulls, output_size, stream, mr); + return compute_hash_join(probe, output_size, stream, mr); } std::pair>, std::unique_ptr>> hash_join::hash_join_impl::full_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, compare_nulls, output_size, stream, mr); + return compute_hash_join(probe, output_size, stream, mr); } std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const { CUDF_FUNC_RANGE(); @@ -316,12 +310,11 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& p *flattened_probe_table_ptr, _hash_table, cudf::has_nulls(flattened_probe_table) | cudf::has_nulls(_build), - compare_nulls, + _nulls_equal, stream); } std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const { CUDF_FUNC_RANGE(); @@ -341,12 +334,11 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& pr *flattened_probe_table_ptr, _hash_table, cudf::has_nulls(flattened_probe_table) | cudf::has_nulls(_build), - compare_nulls, + _nulls_equal, stream); } std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { @@ -362,20 +354,20 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& pr auto build_table_ptr = cudf::table_device_view::create(_build, stream); auto flattened_probe_table_ptr = cudf::table_device_view::create(flattened_probe_table, stream); - return get_full_join_size(*build_table_ptr, - 
*flattened_probe_table_ptr, - _hash_table, - cudf::has_nulls(flattened_probe_table) | cudf::has_nulls(_build), - compare_nulls, - stream, - mr); + return cudf::detail::get_full_join_size( + *build_table_ptr, + *flattened_probe_table_ptr, + _hash_table, + cudf::has_nulls(flattened_probe_table) | cudf::has_nulls(_build), + _nulls_equal, + stream, + mr); } template std::pair>, std::unique_ptr>> hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -403,42 +395,40 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, [](const auto& b, const auto& p) { return b.type() == p.type(); }), "Mismatch in joining column data types"); - return probe_join_indices( - flattened_probe_table, compare_nulls, output_size, stream, mr); + return probe_join_indices(flattened_probe_table, output_size, stream, mr); } template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe, - null_equality compare_nulls, +hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe_table, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { // Trivial left join case - exit early if (_is_empty and JoinKind != cudf::detail::join_kind::INNER_JOIN) { - return get_trivial_left_join_indices(probe, stream, mr); + return get_trivial_left_join_indices(probe_table, stream, mr); } CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null."); auto build_table_ptr = cudf::table_device_view::create(_build, stream); - auto probe_table_ptr = cudf::table_device_view::create(probe, stream); - - auto join_indices = - cudf::detail::probe_join_hash_table(*build_table_ptr, - *probe_table_ptr, - _hash_table, - cudf::has_nulls(probe) | cudf::has_nulls(_build), - compare_nulls, - output_size, - stream, - mr); + auto probe_table_ptr = cudf::table_device_view::create(probe_table, stream); + + auto join_indices = cudf::detail::probe_join_hash_table( + *build_table_ptr, + *probe_table_ptr, + _hash_table, + cudf::has_nulls(probe_table) | cudf::has_nulls(_build), + _nulls_equal, + output_size, + stream, + mr); if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, probe.num_rows(), _build.num_rows(), stream, mr); + join_indices.second, probe_table.num_rows(), _build.num_rows(), stream, mr); join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); } return join_indices; diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 21bfd8120f7..9c44aeebd59 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,7 +89,7 @@ class make_pair_function { * @param probe_table The left hand table * @param hash_table A hash table built on the build table that maps the index * of every row to the hash value of that row. - * @param compare_nulls Controls whether null join-key values should match or not. + * @param nulls_equal Flag to denote nulls are equal or not. 
* @param stream CUDA stream used for device memory operations and kernel launches * * @return The exact size of the output of the join operation @@ -98,8 +98,8 @@ template std::size_t compute_join_output_size(table_device_view build_table, table_device_view probe_table, multimap_type const& hash_table, - bool has_nulls, - null_equality compare_nulls, + bool const has_nulls, + cudf::null_equality const nulls_equal, rmm::cuda_stream_view stream) { const size_type build_table_num_rows{build_table.num_rows()}; @@ -121,7 +121,7 @@ std::size_t compute_join_output_size(table_device_view build_table, } auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls}; - pair_equality equality{probe_table, build_table, probe_nulls, compare_nulls}; + pair_equality equality{probe_table, build_table, probe_nulls, nulls_equal}; row_hash hash_probe{probe_nulls, probe_table}; auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); @@ -152,14 +152,14 @@ std::unique_ptr combine_table_pair(std::unique_ptr&& l * * @param build Table of columns used to build join hash. * @param hash_table Build hash table. - * @param compare_nulls Controls whether null join-key values should match or not. + * @param nulls_equal Flag to denote nulls are equal or not. * @param stream CUDA stream used for device memory operations and kernel launches. * */ template void build_join_hash_table(cudf::table_view const& build, MultimapType& hash_table, - null_equality compare_nulls, + null_equality const nulls_equal, rmm::cuda_stream_view stream) { auto build_table_ptr = cudf::table_device_view::create(build, stream); @@ -174,7 +174,7 @@ void build_join_hash_table(cudf::table_view const& build, auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); size_type const build_table_num_rows{build_table_ptr->num_rows()}; - if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { + if (nulls_equal == cudf::null_equality::EQUAL or (not nullable(build))) { hash_table.insert(iter, iter + build_table_num_rows, stream.value()); } else { thrust::counting_iterator stencil(0); @@ -197,7 +197,8 @@ struct hash_join::hash_join_impl { hash_join_impl& operator=(hash_join_impl&&) = delete; private: - bool _is_empty; + bool const _is_empty; + cudf::null_equality const _nulls_equal; cudf::table_view _build; std::vector> _created_null_columns; cudf::structs::detail::flattened_table _flattened_build_table; @@ -221,7 +222,6 @@ struct hash_join::hash_join_impl { std::pair>, std::unique_ptr>> inner_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -229,7 +229,6 @@ struct hash_join::hash_join_impl { std::pair>, std::unique_ptr>> left_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -237,21 +236,17 @@ struct hash_join::hash_join_impl { std::pair>, std::unique_ptr>> full_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; [[nodiscard]] std::size_t inner_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const; [[nodiscard]] std::size_t left_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const; std::size_t full_join_size(cudf::table_view const& probe, - null_equality 
compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -260,7 +255,6 @@ struct hash_join::hash_join_impl { std::pair>, std::unique_ptr>> compute_hash_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -276,7 +270,6 @@ struct hash_join::hash_join_impl { * @tparam JoinKind The type of join to be performed. * * @param probe_table Table of probe side columns to join. - * @param compare_nulls Controls whether null join-key values should match or not. * @param output_size Optional value which allows users to specify the exact output size. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned vectors. @@ -286,8 +279,7 @@ struct hash_join::hash_join_impl { template std::pair>, std::unique_ptr>> - probe_join_indices(cudf::table_view const& probe, - null_equality compare_nulls, + probe_join_indices(cudf::table_view const& probe_table, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index db79075d864..7a478ca2eb3 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include +#include "join/hash_join.cuh" +#include "join/join_common_utils.hpp" #include #include @@ -51,11 +51,11 @@ inner_join(table_view const& left_input, // build the hash map from the smaller table. if (right.num_rows() > left.num_rows()) { cudf::hash_join hj_obj(left, compare_nulls, stream); - auto result = hj_obj.inner_join(right, compare_nulls, std::nullopt, stream, mr); - return std::make_pair(std::move(result.second), std::move(result.first)); + auto [right_result, left_result] = hj_obj.inner_join(right, std::nullopt, stream, mr); + return std::make_pair(std::move(left_result), std::move(right_result)); } else { cudf::hash_join hj_obj(right, compare_nulls, stream); - return hj_obj.inner_join(left, compare_nulls, std::nullopt, stream, mr); + return hj_obj.inner_join(left, std::nullopt, stream, mr); } } @@ -78,16 +78,17 @@ std::unique_ptr
inner_join(table_view const& left_input,
 
   auto const left  = scatter_columns(matched.second.front(), left_on, left_input);
   auto const right = scatter_columns(matched.second.back(), right_on, right_input);
 
-  auto join_indices = inner_join(left.select(left_on), right.select(right_on), compare_nulls, mr);
+  auto const [left_join_indices, right_join_indices] = cudf::detail::inner_join(
+    left.select(left_on), right.select(right_on), compare_nulls, stream, mr);
 
   std::unique_ptr<table> left_result  = detail::gather(left,
-                                                       join_indices.first->begin(),
-                                                       join_indices.first->end(),
+                                                       left_join_indices->begin(),
+                                                       left_join_indices->end(),
                                                        out_of_bounds_policy::DONT_CHECK,
                                                        stream,
                                                        mr);
   std::unique_ptr<table> right_result = detail::gather(right,
-                                                       join_indices.second->begin(),
-                                                       join_indices.second->end(),
+                                                       right_join_indices->begin(),
+                                                       right_join_indices->end(),
                                                        out_of_bounds_policy::DONT_CHECK,
                                                        stream,
                                                        mr);
@@ -113,7 +114,7 @@ left_join(table_view const& left_input,
   table_view const right = matched.second.back();
 
   cudf::hash_join hj_obj(right, compare_nulls, stream);
-  return hj_obj.left_join(left, compare_nulls, std::nullopt, stream, mr);
+  return hj_obj.left_join(left, std::nullopt, stream, mr);
 }
 
 std::unique_ptr<table>
 left_join(table_view const& left_input,
@@ -134,23 +135,24 @@ std::unique_ptr<table> left_join(table_view const& left_input,
   table_view const left  = scatter_columns(matched.second.front(), left_on, left_input);
   table_view const right = scatter_columns(matched.second.back(), right_on, right_input);
 
-  auto join_indices = left_join(left.select(left_on), right.select(right_on), compare_nulls);
-
-  if ((left_on.empty() || right_on.empty()) ||
-      is_trivial_join(left, right, cudf::detail::join_kind::LEFT_JOIN)) {
-    auto probe_build_pair = get_empty_joined_table(left, right);
-    return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
-                                            std::move(probe_build_pair.second));
+  if ((left_on.empty() or right_on.empty()) or
+      cudf::detail::is_trivial_join(left, right, cudf::detail::join_kind::LEFT_JOIN)) {
+    auto [left_empty_table, right_empty_table] = get_empty_joined_table(left, right);
+    return cudf::detail::combine_table_pair(std::move(left_empty_table),
+                                            std::move(right_empty_table));
   }
+
+  auto const [left_join_indices, right_join_indices] = cudf::detail::left_join(
+    left.select(left_on), right.select(right_on), compare_nulls, stream, mr);
 
   std::unique_ptr<table> left_result  = detail::gather(left,
-                                                       join_indices.first->begin(),
-                                                       join_indices.first->end(),
+                                                       left_join_indices->begin(),
+                                                       left_join_indices->end(),
                                                        out_of_bounds_policy::NULLIFY,
                                                        stream,
                                                        mr);
   std::unique_ptr<table> right_result = detail::gather(right,
-                                                       join_indices.second->begin(),
-                                                       join_indices.second->end(),
+                                                       right_join_indices->begin(),
+                                                       right_join_indices->end(),
                                                        out_of_bounds_policy::NULLIFY,
                                                        stream,
                                                        mr);
@@ -176,7 +178,7 @@ full_join(table_view const& left_input,
   table_view const right = matched.second.back();
 
   cudf::hash_join hj_obj(right, compare_nulls, stream);
-  return hj_obj.full_join(left, compare_nulls, std::nullopt, stream, mr);
+  return hj_obj.full_join(left, std::nullopt, stream, mr);
 }
 
 std::unique_ptr<table>
 full_join(table_view const& left_input,
@@ -197,23 +199,24 @@ std::unique_ptr<table> full_join(table_view const& left_input,
   table_view const left  = scatter_columns(matched.second.front(), left_on, left_input);
   table_view const right = scatter_columns(matched.second.back(), right_on, right_input);
 
-  auto join_indices = full_join(left.select(left_on), right.select(right_on), compare_nulls);
-
-  if ((left_on.empty() || right_on.empty()) ||
-      is_trivial_join(left, right, cudf::detail::join_kind::FULL_JOIN)) {
-    auto probe_build_pair = get_empty_joined_table(left, right);
-    return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
-                                            std::move(probe_build_pair.second));
+  if ((left_on.empty() or right_on.empty()) or
+      cudf::detail::is_trivial_join(left, right, cudf::detail::join_kind::FULL_JOIN)) {
+    auto [left_empty_table, right_empty_table] = get_empty_joined_table(left, right);
+    return cudf::detail::combine_table_pair(std::move(left_empty_table),
+                                            std::move(right_empty_table));
   }
+
+  auto const [left_join_indices, right_join_indices] = cudf::detail::full_join(
+    left.select(left_on), right.select(right_on), compare_nulls, stream, mr);
 
   std::unique_ptr<table> left_result  = detail::gather(left,
-                                                       join_indices.first->begin(),
-                                                       join_indices.first->end(),
+                                                       left_join_indices->begin(),
+                                                       left_join_indices->end(),
                                                        out_of_bounds_policy::NULLIFY,
                                                        stream,
                                                        mr);
   std::unique_ptr<table>
right_result = detail::gather(right, - join_indices.second->begin(), - join_indices.second->end(), + right_join_indices->begin(), + right_join_indices->end(), out_of_bounds_policy::NULLIFY, stream, mr); @@ -234,56 +237,50 @@ hash_join::hash_join(cudf::table_view const& build, std::pair>, std::unique_ptr>> hash_join::inner_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->inner_join(probe, compare_nulls, output_size, stream, mr); + return impl->inner_join(probe, output_size, stream, mr); } std::pair>, std::unique_ptr>> hash_join::left_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->left_join(probe, compare_nulls, output_size, stream, mr); + return impl->left_join(probe, output_size, stream, mr); } std::pair>, std::unique_ptr>> hash_join::full_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->full_join(probe, compare_nulls, output_size, stream, mr); + return impl->full_join(probe, output_size, stream, mr); } std::size_t hash_join::inner_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const { - return impl->inner_join_size(probe, compare_nulls, stream); + return impl->inner_join_size(probe, stream); } std::size_t hash_join::left_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const { - return impl->left_join_size(probe, compare_nulls, stream); + return impl->left_join_size(probe, stream); } std::size_t hash_join::full_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->full_join_size(probe, compare_nulls, stream, mr); + return impl->full_join_size(probe, stream, mr); } // external APIs diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu index 9ac4db3a34b..464a8688a2d 100644 --- a/cpp/src/reductions/scan/rank_scan.cu +++ b/cpp/src/reductions/scan/rank_scan.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -90,7 +91,7 @@ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, return rank_generator( order_by, has_nested_nulls(table_view{{order_by}}), - [] __device__(bool equality, auto row_index) { return equality; }, + [] __device__(bool const unequal, size_type const) { return unequal ? 1 : 0; }, DeviceSum{}, stream, mr); @@ -105,11 +106,34 @@ std::unique_ptr inclusive_rank_scan(column_view const& order_by, return rank_generator( order_by, has_nested_nulls(table_view{{order_by}}), - [] __device__(bool equality, auto row_index) { return equality ? row_index + 1 : 0; }, + [] __device__(bool unequal, auto row_index) { return unequal ? 
row_index + 1 : 0; }, DeviceMax{}, stream, mr); } +std::unique_ptr inclusive_percent_rank_scan(column_view const& order_by, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const rank_column = + inclusive_rank_scan(order_by, stream, rmm::mr::get_current_device_resource()); + auto const rank_view = rank_column->view(); + + // Result type for PERCENT_RANK is independent of input type. + using result_type = cudf::detail::target_type_t; + auto percent_rank_result = cudf::make_fixed_width_column( + data_type{type_to_id()}, rank_view.size(), mask_state::UNALLOCATED, stream, mr); + + thrust::transform(rmm::exec_policy(stream), + rank_view.begin(), + rank_view.end(), + percent_rank_result->mutable_view().begin(), + [n_rows = rank_view.size()] __device__(auto const rank) { + return n_rows == 1 ? 0.0 : ((rank - 1.0) / (n_rows - 1)); + }); + return percent_rank_result; +} + } // namespace detail } // namespace cudf diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index 5f10707d7fb..d73fc862380 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,14 +34,19 @@ std::unique_ptr scan(column_view const& input, if (agg->kind == aggregation::RANK) { CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, - "Unsupported rank aggregation operator for exclusive scan"); + "Rank aggregation operator requires an inclusive scan"); return inclusive_rank_scan(input, rmm::cuda_stream_default, mr); } if (agg->kind == aggregation::DENSE_RANK) { CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, - "Unsupported dense rank aggregation operator for exclusive scan"); + "Dense rank aggregation operator requires an inclusive scan"); return inclusive_dense_rank_scan(input, rmm::cuda_stream_default, mr); } + if (agg->kind == aggregation::PERCENT_RANK) { + CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, + "Percent rank aggregation operator requires an inclusive scan"); + return inclusive_percent_rank_scan(input, rmm::cuda_stream_default, mr); + } return inclusive == scan_type::EXCLUSIVE ? detail::scan_exclusive(input, agg, null_handling, rmm::cuda_stream_default, mr) diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index c4076dd61c1..a67af9442f0 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -83,6 +83,7 @@ struct extract_fn { std::unique_ptr
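// A host-side sketch of the percent-rank transform introduced above (assumed
// simplified to plain vectors): percent = (rank - 1) / (n_rows - 1), pinned to
// 0.0 for a single-row input where the denominator would otherwise be zero.
#include <cstddef>
#include <vector>

std::vector<double> percent_rank(std::vector<int> const& ranks)
{
  auto const n = ranks.size();
  std::vector<double> out(n);
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = (n == 1) ? 0.0 : (ranks[i] - 1.0) / (n - 1);
  }
  return out;  // e.g. ranks {1, 2, 2, 4} -> {0.0, 1.0/3, 1.0/3, 1.0}
}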
extract( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -91,7 +92,8 @@ std::unique_ptr
extract( auto const d_strings = *strings_column; // compile regex into device object - auto prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto prog = + reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); auto d_prog = *prog; // extract should include groups auto const groups = d_prog.group_counts(); @@ -150,10 +152,11 @@ std::unique_ptr
extract(
 
 std::unique_ptr<table>
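// Hedged usage sketch of the widened signature (pattern and flags are
// illustrative): MULTILINE makes '^' anchor at line starts within each string,
// and regex_flags::DEFAULT preserves the previous behavior.
void extract_example(cudf::strings_column_view const& input,
                     rmm::mr::device_memory_resource* mr)
{
  auto first_number_per_line =
    cudf::strings::extract(input, "^(\\d+)", cudf::strings::regex_flags::MULTILINE, mr);
}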
extract(strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract(strings, pattern, rmm::cuda_stream_default, mr); + return detail::extract(strings, pattern, flags, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index c4749eae003..e27dccb9338 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -89,13 +89,14 @@ struct extract_fn { } // namespace /** - * @copydoc cudf::strings::extract_all + * @copydoc cudf::strings::extract_all_record * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_all( +std::unique_ptr extract_all_record( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -103,7 +104,8 @@ std::unique_ptr extract_all( auto const d_strings = column_device_view::create(strings.parent(), stream); // Compile regex into device object. - auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); // The extract pattern should always include groups. auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "extract_all requires group indicators in the regex pattern."); @@ -179,12 +181,13 @@ std::unique_ptr extract_all( // external API -std::unique_ptr extract_all(strings_column_view const& strings, - std::string const& pattern, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_all_record(strings_column_view const& strings, + std::string const& pattern, + regex_flags const flags, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_all(strings, pattern, rmm::cuda_stream_default, mr); + return detail::extract_all_record(strings, pattern, flags, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 065c358d08b..6f36658523b 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -280,15 +280,15 @@ class regex_parser { yy = 0; if (a >= '0' && a <= '9') yy += (a - '0') << 4; - else if (a > 'a' && a <= 'f') + else if (a >= 'a' && a <= 'f') yy += (a - 'a' + 10) << 4; - else if (a > 'A' && a <= 'F') + else if (a >= 'A' && a <= 'F') yy += (a - 'A' + 10) << 4; if (b >= '0' && b <= '9') yy += b - '0'; - else if (b > 'a' && b <= 'f') + else if (b >= 'a' && b <= 'f') yy += b - 'a' + 10; - else if (b > 'A' && b <= 'F') + else if (b >= 'A' && b <= 'F') yy += b - 'A' + 10; break; } diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu new file mode 100644 index 00000000000..d80148f2fe6 --- /dev/null +++ b/cpp/src/strings/split/split_re.cu @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +using string_index_pair = thrust::pair; + +enum class split_direction { + FORWARD, ///< for split logic + BACKWARD ///< for rsplit logic +}; + +/** + * @brief Identify the tokens from the `idx'th` string element of `d_strings`. + * + * Each string's tokens are stored in the `d_tokens` vector. + * The `d_token_offsets` specifies the output position within `d_tokens` + * for each string. + */ +template +struct token_reader_fn { + column_device_view const d_strings; + reprog_device prog; + split_direction const direction; + offset_type const* d_token_offsets; + string_index_pair* d_tokens; + + __device__ void operator()(size_type idx) + { + if (d_strings.is_null(idx)) { return; } + auto const d_str = d_strings.element(idx); + + auto const token_offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - token_offset; + auto const d_result = d_tokens + token_offset; // store tokens here + + size_type token_idx = 0; + size_type begin = 0; // characters + size_type end = d_str.length(); + size_type last_pos = 0; // bytes + while (prog.find(idx, d_str, begin, end) > 0) { + // get the token (characters just before this match) + auto const token = + string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos}; + // store it if we have space + if (token_idx < token_count - 1) { + d_result[token_idx++] = token; + } else { + if (direction == split_direction::FORWARD) { break; } // we are done + for (auto l = 0; l < token_idx - 1; ++l) { + d_result[l] = d_result[l + 1]; // shift left + } + d_result[token_idx - 1] = token; + } + // setup for next match + last_pos = d_str.byte_offset(end); + begin = end + (begin == end); + end = d_str.length(); + } + + // set the last token to the remainder of the string + d_result[token_idx] = string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos}; + + if (direction == split_direction::BACKWARD) { + // update first entry -- this happens when max_tokens is hit before the end of the string + auto const first_offset = + d_result[0].first + ? static_cast(thrust::distance(d_str.data(), d_result[0].first)) + : 0; + if (first_offset) { + d_result[0] = string_index_pair{d_str.data(), first_offset + d_result[0].second}; + } + } + } +}; + +/** + * @brief Call regex to split each input string into tokens. + * + * This will also convert the `offsets` values from counts to offsets. + * + * @param d_strings Strings to split + * @param d_prog Regex to evaluate against each string + * @param direction Whether tokens are generated forwards or backwards. + * @param max_tokens The maximum number of tokens for each split. + * @param offsets The number of matches on input. + * The offsets for each token in each string on output. + * @param stream CUDA stream used for kernel launches. 
+ */ +rmm::device_uvector generate_tokens(column_device_view const& d_strings, + reprog_device& d_prog, + split_direction direction, + size_type maxsplit, + mutable_column_view& offsets, + rmm::cuda_stream_view stream) +{ + auto const strings_count = d_strings.size(); + + auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); + + auto const begin = thrust::make_counting_iterator(0); + auto const end = thrust::make_counting_iterator(strings_count); + auto const d_offsets = offsets.data(); + + // convert match counts to token offsets + auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) { + return d_strings.is_null(idx) ? 0 : std::min(d_offsets[idx], max_tokens) + 1; + }; + thrust::transform_exclusive_scan( + rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus{}); + + // the last offset entry is the total number of tokens to be generated + auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); + + // generate tokens for each string + rmm::device_uvector tokens(total_tokens, stream); + auto const regex_insts = d_prog.insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else if (regex_insts <= RX_LARGE_INSTS) { + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else { + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } + + return tokens; +} + +/** + * @brief Returns string pair for the specified column for each string in `d_strings` + * + * This is used to build the table result of a split. + * Null is returned if the row is null or if the `column_index` is larger + * than the token count for that string. + */ +struct tokens_transform_fn { + column_device_view const d_strings; + string_index_pair const* d_tokens; + offset_type const* d_token_offsets; + size_type const column_index; + + __device__ string_index_pair operator()(size_type idx) const + { + auto const offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - offset; + return (column_index >= token_count) || d_strings.is_null(idx) + ? string_index_pair{nullptr, 0} + : d_tokens[offset + column_index]; + } +}; + +std::unique_ptr
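// A host-side sketch of the counts-to-offsets step in generate_tokens above
// (names and values hypothetical): each row contributes
// min(match_count, max_tokens) + 1 tokens, and an exclusive scan turns those
// sizes into offsets; the device code fuses both steps with
// thrust::transform_exclusive_scan.
#include <algorithm>
#include <numeric>
#include <vector>

std::vector<int> token_offsets(std::vector<int> const& match_counts, int max_tokens)
{
  std::vector<int> offsets(match_counts.size() + 1, 0);
  std::transform(match_counts.begin(), match_counts.end(), offsets.begin(),
                 [max_tokens](int c) { return std::min(c, max_tokens) + 1; });
  std::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin(), 0);
  // e.g. counts {2, 0, 1} with max_tokens 10 -> sizes {3, 1, 2}
  //      -> offsets {0, 3, 4, 6}; the trailing 6 is the total token count
  return offsets;
}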
+split_re(strings_column_view const& input,
+         std::string const& pattern,
+         split_direction direction,
+         size_type maxsplit,
+         rmm::cuda_stream_view stream,
+         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
+
+  auto const strings_count = input.size();
+
+  std::vector<std::unique_ptr<column>> results;
+  if (strings_count == 0) {
+    results.push_back(make_empty_column(type_id::STRING));
+    return std::make_unique<table>
(std::move(results)); + } + + // create the regex device prog from the given pattern + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + // count the number of delimiters matched in each string + auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); + auto offsets_view = offsets->mutable_view(); + auto d_offsets = offsets_view.data(); + + // get the split tokens from the input column; this also converts the counts into offsets + auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream); + + // the output column count is the maximum number of tokens generated for any input string + auto const columns_count = thrust::transform_reduce( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + [d_offsets] __device__(auto const idx) -> size_type { + return d_offsets[idx + 1] - d_offsets[idx]; + }, + 0, + thrust::maximum{}); + + // boundary case: if no columns, return one all-null column (custrings issue #119) + if (columns_count == 0) { + results.push_back(std::make_unique( + data_type{type_id::STRING}, + strings_count, + rmm::device_buffer{0, stream, mr}, // no data + cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), + strings_count)); + return std::make_unique
(std::move(results)); + } + + // convert the tokens into multiple strings columns + auto make_strings_lambda = [&](size_type column_index) { + // returns appropriate token for each row/column + auto indices_itr = cudf::detail::make_counting_transform_iterator( + 0, tokens_transform_fn{*d_strings, tokens.data(), d_offsets, column_index}); + return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr); + }; + // build a vector of columns + results.resize(columns_count); + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(columns_count), + results.begin(), + make_strings_lambda); + + return std::make_unique
(std::move(results)); +} + +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + split_direction direction, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + + auto const strings_count = input.size(); + + // create the regex device prog from the given pattern + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + // count the number of delimiters matched in each string + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto offsets_view = offsets->mutable_view(); + + // get the split tokens from the input column; this also converts the counts into offsets + auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream); + + // convert the tokens into one big strings column + auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); + + // create a lists column using the offsets and the strings columns + return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} + +} // namespace + +std::unique_ptr
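// Hedged usage sketch of the two entry points (inputs illustrative): split_re
// widens each row into table columns, padding short rows with nulls, while
// split_record_re keeps one list of tokens per row. A non-positive maxsplit
// means "no limit", per generate_tokens above.
void split_re_example(cudf::strings_column_view const& input,
                      rmm::mr::device_memory_resource* mr)
{
  auto table_result = cudf::strings::split_re(input, "[0-9]", -1, mr);
  auto lists_result = cudf::strings::split_record_re(input, "[0-9]", -1, mr);
  // e.g. rows {"a1b2c", "d3e"} -> columns {"a","d"}, {"b","e"}, {"c",null}
  //                            -> lists   [["a","b","c"], ["d","e"]]
}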
split_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); +} + +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_record_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); +} + +std::unique_ptr
rsplit_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); +} + +std::unique_ptr rsplit_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_record_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); +} + +} // namespace detail + +// external APIs + +std::unique_ptr
split_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + +std::unique_ptr
rsplit_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + +std::unique_ptr rsplit_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 913761ecd03..f96edd3ce5a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -23,7 +23,14 @@ function(ConfigureTest CMAKE_TEST_NAME) ${CMAKE_TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + # For std:: support of __int128_t. Can be removed once using cuda::std + CXX_EXTENSIONS ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) + target_link_libraries(${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main) add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) install( @@ -41,8 +48,13 @@ endfunction() # ################################################################################################## # * column tests ---------------------------------------------------------------------------------- ConfigureTest( - COLUMN_TEST column/bit_cast_test.cpp column/column_view_shallow_test.cpp column/column_test.cu - column/column_device_view_test.cu column/compound_test.cu + COLUMN_TEST + column/bit_cast_test.cpp + column/column_device_view_test.cu + column/column_test.cu + column/column_view_device_span_test.cpp + column/column_view_shallow_test.cpp + column/compound_test.cu ) # ################################################################################################## @@ -199,6 +211,7 @@ ConfigureTest( ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp) ConfigureTest(CSV_TEST io/csv_test.cpp) +ConfigureTest(FILE_IO_TEST io/file_io_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) diff --git a/cpp/tests/column/column_view_device_span_test.cpp b/cpp/tests/column/column_view_device_span_test.cpp new file mode 100644 index 00000000000..2b7ea3b3650 --- /dev/null +++ b/cpp/tests/column/column_view_device_span_test.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +template () or cudf::is_chrono())> +std::unique_ptr example_column() +{ + auto begin = thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + return cudf::test::fixed_width_column_wrapper(begin, end).release(); +} + +template +struct ColumnViewDeviceSpanTests : public cudf::test::BaseFixture { +}; + +using DeviceSpanTypes = cudf::test::FixedWidthTypesWithoutFixedPoint; +TYPED_TEST_SUITE(ColumnViewDeviceSpanTests, DeviceSpanTypes); + +TYPED_TEST(ColumnViewDeviceSpanTests, conversion_round_trip) +{ + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + + // Test implicit conversion, round trip + cudf::device_span device_span_from_col_view = col_view; + cudf::column_view col_view_from_device_span = device_span_from_col_view; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_view, col_view_from_device_span); +} + +struct ColumnViewDeviceSpanErrorTests : public cudf::test::BaseFixture { +}; + +TEST_F(ColumnViewDeviceSpanErrorTests, type_mismatch) +{ + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + EXPECT_THROW((void)cudf::device_span{col_view}, cudf::logic_error); +} + +TEST_F(ColumnViewDeviceSpanErrorTests, nullable_column) +{ + auto col = example_column(); + col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_NULL), col->size()); + auto col_view = cudf::column_view{*col}; + EXPECT_THROW((void)cudf::device_span{col_view}, cudf::logic_error); +} diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index 210b4b8f90d..256f9129cbf 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -54,10 +54,22 @@ std::unique_ptr make_scalar( } template -auto lowest = std::numeric_limits::lowest(); +constexpr auto highest() +{ + // chrono types do not have std::numeric_limits specializations and should use T::max() + // https://eel.is/c++draft/numeric.limits.general#6 + if constexpr (cudf::is_chrono()) return T::max(); + return std::numeric_limits::max(); +} template -auto highest = std::numeric_limits::max(); +constexpr auto lowest() +{ + // chrono types do not have std::numeric_limits specializations and should use T::min() + // https://eel.is/c++draft/numeric.limits.general#6 + if constexpr (cudf::is_chrono()) return T::min(); + return std::numeric_limits::lowest(); +} template struct ShiftTest : public cudf::test::BaseFixture { @@ -101,16 +113,16 @@ TYPED_TEST(ShiftTest, OneColumn) { using T = TypeParam; - auto input = fixed_width_column_wrapper{lowest, + auto input = fixed_width_column_wrapper{lowest(), cudf::test::make_type_param_scalar(1), cudf::test::make_type_param_scalar(2), cudf::test::make_type_param_scalar(3), cudf::test::make_type_param_scalar(4), cudf::test::make_type_param_scalar(5), - highest}; + highest()}; auto expected = fixed_width_column_wrapper{cudf::test::make_type_param_scalar(7), cudf::test::make_type_param_scalar(7), - lowest, + lowest(), cudf::test::make_type_param_scalar(1), cudf::test::make_type_param_scalar(2), cudf::test::make_type_param_scalar(3), @@ -126,16 +138,16 @@ TYPED_TEST(ShiftTest, OneColumnNegativeShift) { using T = TypeParam; - auto input = fixed_width_column_wrapper{lowest, + auto input = fixed_width_column_wrapper{lowest(), cudf::test::make_type_param_scalar(1), cudf::test::make_type_param_scalar(2), cudf::test::make_type_param_scalar(3), cudf::test::make_type_param_scalar(4), cudf::test::make_type_param_scalar(5), - highest}; + highest()}; auto expected = fixed_width_column_wrapper{cudf::test::make_type_param_scalar(4), cudf::test::make_type_param_scalar(5), - highest, + highest(), cudf::test::make_type_param_scalar(7), cudf::test::make_type_param_scalar(7), cudf::test::make_type_param_scalar(7), @@ -151,16 +163,16 @@ TYPED_TEST(ShiftTest, OneColumnNullFill) { using T = TypeParam; - auto input = fixed_width_column_wrapper{lowest, + auto input = fixed_width_column_wrapper{lowest(), cudf::test::make_type_param_scalar(5), cudf::test::make_type_param_scalar(0), cudf::test::make_type_param_scalar(3), cudf::test::make_type_param_scalar(0), cudf::test::make_type_param_scalar(1), - highest}; + highest()}; auto expected = fixed_width_column_wrapper({cudf::test::make_type_param_scalar(0), cudf::test::make_type_param_scalar(0), - lowest, + lowest(), cudf::test::make_type_param_scalar(5), cudf::test::make_type_param_scalar(0), cudf::test::make_type_param_scalar(3), diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index fddaa9d2050..fd065249c4e 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
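// The highest()/lowest() helpers above guard against a quiet footgun: the
// primary std::numeric_limits template is defined for arbitrary T and returns
// value-initialized results, so for cudf chrono types (which have no
// specialization) max() silently yields zero instead of failing to compile.
// A minimal illustration with a hypothetical strong type:
#include <limits>

struct my_duration { int ticks; };  // no numeric_limits specialization
static_assert(std::numeric_limits<my_duration>::is_specialized == false);
// std::numeric_limits<my_duration>::max() still compiles and returns
// my_duration{}, i.e. zero ticks -- hence the is_chrono branches that call
// T::max() / T::min() instead.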
@@ -129,8 +129,13 @@ struct AtomicsTest : public cudf::test::BaseFixture { thrust::host_vector result_init(9); // +3 padding for int8 tests result_init[0] = cudf::test::make_type_param_scalar(0); - result_init[1] = std::numeric_limits::max(); - result_init[2] = std::numeric_limits::min(); + if constexpr (cudf::is_chrono()) { + result_init[1] = T::max(); + result_init[2] = T::min(); + } else { + result_init[1] = std::numeric_limits::max(); + result_init[2] = std::numeric_limits::min(); + } result_init[3] = result_init[0]; result_init[4] = result_init[1]; result_init[5] = result_init[2]; diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index 1b1b12ea69e..81369beb2ec 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,17 +23,27 @@ #include -using namespace cudf::test::iterators; - namespace cudf { namespace test { -inline void test_pair_rank_scans(column_view const& keys, - column_view const& order, - column_view const& expected_dense, - column_view const& expected_rank, - null_policy include_null_keys = null_policy::INCLUDE, - sorted keys_are_sorted = sorted::YES) +using namespace iterators; + +template +using input = fixed_width_column_wrapper; +using rank_result_col = fixed_width_column_wrapper; +using percent_result_t = + cudf::detail::target_type_t; +using percent_result_col = fixed_width_column_wrapper; +using null_iter_t = decltype(nulls_at({})); + +auto constexpr X = int32_t{0}; // Placeholder for NULL rows. +auto const all_valid = nulls_at({}); + +inline void test_rank_scans(column_view const& keys, + column_view const& order, + column_view const& expected_dense, + column_view const& expected_rank, + column_view const& expected_percent_rank) { test_single_scan(keys, order, @@ -49,6 +59,13 @@ inline void test_pair_rank_scans(column_view const& keys, make_rank_aggregation(), null_policy::INCLUDE, sorted::YES); + test_single_scan(keys, + order, + keys, + expected_percent_rank, + make_percent_rank_aggregation(), + null_policy::INCLUDE, + sorted::YES); } struct groupby_rank_scan_test : public BaseFixture { @@ -70,248 +87,283 @@ TYPED_TEST(typed_groupby_rank_scan_test, empty_cols) { using T = TypeParam; - fixed_width_column_wrapper keys{}; - fixed_width_column_wrapper order_col{}; - structs_column_wrapper struct_order{}; + auto const keys = input{}; + auto const order_by = input{}; + auto const order_by_struct = structs_column_wrapper{}; - fixed_width_column_wrapper expected_dense_vals{}; - fixed_width_column_wrapper expected_rank_vals{}; + auto const expected_dense = rank_result_col{}; + auto const expected_rank = rank_result_col{}; + auto const expected_percent = percent_result_col{}; - test_pair_rank_scans(keys, order_col, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, zero_valid_keys) { using T = TypeParam; - fixed_width_column_wrapper keys{{1, 2, 3}, all_nulls()}; - fixed_width_column_wrapper order_col1{3, 3, 1}; - fixed_width_column_wrapper order_col2{3, 3, 
1}; - fixed_width_column_wrapper order_col3{3, 3, 1}; - structs_column_wrapper struct_order{order_col2, order_col3}; + auto const keys = input{{X, X, X}, all_nulls()}; + auto const order_by = input{{3, 3, 1}}; + auto const order_by_struct = [] { + auto member_1 = input{{3, 3, 1}}; + auto member_2 = input{{3, 3, 1}}; + return structs_column_wrapper{member_1, member_2}; + }(); - fixed_width_column_wrapper expected_dense_vals{1, 1, 2}; - fixed_width_column_wrapper expected_rank_vals{1, 1, 3}; + auto const dense_rank_results = rank_result_col{1, 1, 2}; + auto const rank_results = rank_result_col{1, 1, 3}; + auto const percent_rank_result = percent_result_col{0, 0, 1}; - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); + test_rank_scans(keys, order_by, dense_rank_results, rank_results, percent_rank_result); + test_rank_scans(keys, order_by_struct, dense_rank_results, rank_results, percent_rank_result); } TYPED_TEST(typed_groupby_rank_scan_test, zero_valid_orders) { - using T = TypeParam; - - fixed_width_column_wrapper keys{1, 1, 3, 3}; - fixed_width_column_wrapper order_col1{{5, 6, 7, 8}, all_nulls()}; - fixed_width_column_wrapper order_col2{{5, 6, 7, 8}, all_nulls()}; - fixed_width_column_wrapper order_col3{{5, 6, 7, 8}, all_nulls()}; - fixed_width_column_wrapper order_col4{{5, 6, 7, 8}, all_nulls()}; - fixed_width_column_wrapper order_col5{{5, 6, 7, 8}, all_nulls()}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, all_nulls()}; - - fixed_width_column_wrapper expected_dense_vals{1, 1, 1, 1}; - fixed_width_column_wrapper expected_rank_vals{1, 1, 1, 1}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order_with_nulls, expected_dense_vals, expected_rank_vals); + using T = TypeParam; + using null_iter_t = decltype(all_nulls()); + + auto const keys = input{{1, 1, 3, 3}}; + auto const make_order_by = [&] { return input{{X, X, X, X}, all_nulls()}; }; + auto const make_struct_order_by = [&](null_iter_t const& null_iter = no_nulls()) { + auto member1 = make_order_by(); + auto member2 = make_order_by(); + return structs_column_wrapper{{member1, member2}, null_iter}; + }; + auto const order_by = make_order_by(); + auto const order_by_struct = make_struct_order_by(); + auto const order_by_struct_all_nulls = make_struct_order_by(all_nulls()); + + auto const expected_dense = rank_result_col{1, 1, 1, 1}; + auto const expected_rank = rank_result_col{1, 1, 1, 1}; + auto const expected_percent = percent_result_col{0, 0, 0, 0}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct_all_nulls, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, basic) { using T = TypeParam; - fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; - fixed_width_column_wrapper order_col1{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; - fixed_width_column_wrapper order_col2{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; - fixed_width_column_wrapper order_col3{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; - structs_column_wrapper struct_order{order_col2, order_col3}; - - 
fixed_width_column_wrapper expected_dense_vals = { - {1, 1, 1, 2, 2, 2, 3, 1, 2, 2, 3, 3}}; - fixed_width_column_wrapper expected_rank_vals = - fixed_width_column_wrapper{{1, 1, 1, 4, 4, 4, 7, 1, 2, 2, 4, 4}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); + auto const keys = input{0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; + auto const make_order_by = [&] { return input{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; }; + auto const order_by = make_order_by(); + auto const order_by_struct = [&] { + auto order2 = make_order_by(); + auto order3 = make_order_by(); + return structs_column_wrapper{order2, order3}; + }(); + + auto const expected_dense = rank_result_col{1, 1, 1, 2, 2, 2, 3, 1, 2, 2, 3, 3}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 4, 4, 7, 1, 2, 2, 4, 4}; + auto const expected_percent = percent_result_col{ + 0.0, 0.0, 0.0, 3.0 / 6, 3.0 / 6, 3.0 / 6, 6.0 / 6, 0.0, 1.0 / 4, 1.0 / 4, 3.0 / 4, 3.0 / 4}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, null_orders) { using T = TypeParam; - fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1}; - fixed_width_column_wrapper order_col1{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col2{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col3{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col4{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col5{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - - fixed_width_column_wrapper expected_dense_vals{{1, 2, 3, 4, 5, 5, 1, 1, 2, 3, 3, 3}}; - fixed_width_column_wrapper expected_rank_vals{{1, 2, 3, 4, 5, 5, 1, 1, 3, 4, 4, 4}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); + auto const null_mask = nulls_at({2, 8}); + auto const keys = input{{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1}}; + auto const make_order_by = [&] { + return input{{-1, -2, X, -2, -3, -3, -4, -4, X, -5, -5, -5}, null_mask}; + }; + auto const make_struct_order_by = [&](null_iter_t const& null_iter = all_valid) { + auto member1 = make_order_by(); + auto member2 = make_order_by(); + return structs_column_wrapper{{member1, member2}, null_iter}; + }; + auto const order_by = make_order_by(); + auto const order_by_struct = make_struct_order_by(); + auto const order_by_struct_with_nulls = make_struct_order_by(null_mask); + + auto const expected_dense = rank_result_col{1, 2, 3, 4, 5, 5, 1, 1, 2, 3, 3, 3}; + auto const expected_rank = rank_result_col{1, 2, 3, 4, 5, 5, 1, 1, 3, 4, 4, 4}; + auto const expected_percent = percent_result_col{ + 0.0, 1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 4.0 / 5, 0.0, 0.0, 2.0 / 5, 3.0 / 5, 3.0 / 5, 3.0 / 5}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, 
expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); + test_rank_scans( + keys, order_by_struct_with_nulls, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, null_orders_and_keys) { using T = TypeParam; - fixed_width_column_wrapper keys = {{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; - fixed_width_column_wrapper order_col1{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col2{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col3{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col4{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col5{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - - fixed_width_column_wrapper expected_dense_vals{{1, 2, 3, 4, 5, 5, 1, 1, 2, 1, 1, 2}}; - fixed_width_column_wrapper expected_rank_vals{{1, 2, 3, 4, 5, 5, 1, 1, 3, 1, 1, 3}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order_with_nulls, expected_dense_vals, expected_rank_vals); + auto const null_mask = nulls_at({2, 8}); + auto const keys = input{{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1}, nulls_at({9, 10, 11})}; + auto const make_order_by = [&] { + return input{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, null_mask}; + }; + auto const make_struct_order_by = [&](null_iter_t const& null_iter = all_valid) { + auto member1 = make_order_by(); + auto member2 = make_order_by(); + return structs_column_wrapper{{member1, member2}, null_iter}; + }; + auto const order_by = make_order_by(); + auto const order_by_struct = make_struct_order_by(); + auto const order_by_struct_with_nulls = make_struct_order_by(null_mask); + + auto const expected_dense = rank_result_col{{1, 2, 3, 4, 5, 5, 1, 1, 2, 1, 1, 2}}; + auto const expected_rank = rank_result_col{{1, 2, 3, 4, 5, 5, 1, 1, 3, 1, 1, 3}}; + auto const expected_percent = percent_result_col{ + {0.0, 1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 4.0 / 5, 0.0, 0.0, 2.0 / 2, 0.0, 0.0, 2.0 / 2}}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); + test_rank_scans( + keys, order_by_struct_with_nulls, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, mixedStructs) { - auto col = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto strings = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto struct_col = structs_column_wrapper{{col, strings}, null_at(11)}.release(); - - strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; - auto expected_dense_vals = - fixed_width_column_wrapper{1, 1, 2, 2, 3, 4, 1, 1, 2, 1, 1, 2}; - auto expected_rank_vals 
= - fixed_width_column_wrapper{1, 1, 3, 3, 5, 6, 1, 1, 3, 1, 1, 3}; + auto const struct_col = [] { + auto nums = input{{0, 0, 7, 7, 7, X, 4, 4, 4, 9, 9, 9}, null_at(5)}; + auto strings = strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "XX", "9", "9", "10d"}, null_at(8)}; + return structs_column_wrapper{{nums, strings}, null_at(11)}.release(); + }(); + + auto const keys = strings_column_wrapper{ + {"0", "0", "0", "0", "0", "0", "1", "1", "1", "X", "X", "X"}, nulls_at({9, 10, 11})}; + + auto const expected_dense = rank_result_col{1, 1, 2, 2, 3, 4, 1, 1, 2, 1, 1, 2}; + auto const expected_rank = rank_result_col{1, 1, 3, 3, 5, 6, 1, 1, 3, 1, 1, 3}; + auto const expected_percent = percent_result_col{ + 0.0, 0.0, 2.0 / 5, 2.0 / 5, 4.0 / 5, 5.0 / 5, 0.0, 0.0, 2.0 / 2, 0.0, 0.0, 2.0 / 2}; std::vector requests; requests.emplace_back(groupby::scan_request()); requests[0].values = *struct_col; requests[0].aggregations.push_back(make_dense_rank_aggregation()); requests[0].aggregations.push_back(make_rank_aggregation()); + requests[0].aggregations.push_back(make_percent_rank_aggregation()); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); - auto result = gb_obj.scan(requests); + auto [result_keys, agg_results] = gb_obj.scan(requests); - CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({keys}), result.first->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[0], expected_dense_vals); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[1], expected_rank_vals); + CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({keys}), result_keys->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[0], expected_dense); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[1], expected_rank); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[2], expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, nestedStructs) { using T = TypeParam; - auto col1 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto col2 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto col3 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto col4 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto strings1 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto strings2 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto struct_col = structs_column_wrapper{col1, strings1}; - auto nested_col = structs_column_wrapper{struct_col, col2}.release(); - auto flattened_col = structs_column_wrapper{col3, strings2, col4}.release(); - - strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; + auto nested_structs = [] { + auto structs_member = [] { + auto nums_member = input{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + auto strings_member = strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; + return structs_column_wrapper{nums_member, strings_member}; + }(); + auto nums_member = input{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + return structs_column_wrapper{structs_member, nums_member}.release(); + }(); + + auto flat_struct = [] { + auto nums_member = input{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + auto 
strings_member = strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; + auto nuther_nums = + fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + return structs_column_wrapper{nums_member, strings_member, nuther_nums}.release(); + }(); + + auto const keys = strings_column_wrapper{ + {"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, nulls_at({9, 10, 11})}; std::vector requests; requests.emplace_back(groupby::scan_request()); requests.emplace_back(groupby::scan_request()); - requests[0].values = *nested_col; + requests[0].values = *nested_structs; requests[0].aggregations.push_back(make_dense_rank_aggregation()); requests[0].aggregations.push_back(make_rank_aggregation()); - requests[1].values = *flattened_col; + requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[1].values = *flat_struct; requests[1].aggregations.push_back(make_dense_rank_aggregation()); requests[1].aggregations.push_back(make_rank_aggregation()); + requests[1].aggregations.push_back(make_percent_rank_aggregation()); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); - auto result = gb_obj.scan(requests); + auto [result_keys, agg_results] = gb_obj.scan(requests); - CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({keys}), result.first->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[0], *result.second[1].results[0]); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[1], *result.second[1].results[1]); + CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({keys}), result_keys->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[0], *agg_results[1].results[0]); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[1], *agg_results[1].results[1]); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[2], *agg_results[1].results[2]); } TYPED_TEST(typed_groupby_rank_scan_test, structsWithNullPushdown) { using T = TypeParam; - auto col1 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto col2 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto strings1 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto strings2 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - - std::vector> struct_columns; - struct_columns.push_back(col1.release()); - struct_columns.push_back(strings1.release()); - auto struct_col = - cudf::make_structs_column(12, std::move(struct_columns), 0, rmm::device_buffer{}); - auto const struct_nulls = - thrust::host_vector(std::vector{1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}); - struct_col->set_null_mask( - cudf::test::detail::make_null_mask(struct_nulls.begin(), struct_nulls.end())); - - std::vector> null_struct_columns; - null_struct_columns.push_back(col2.release()); - null_struct_columns.push_back(strings2.release()); - auto null_col = - cudf::make_structs_column(12, std::move(null_struct_columns), 0, rmm::device_buffer{}); - null_col->set_null_mask(create_null_mask(12, cudf::mask_state::ALL_NULL)); - - strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; + auto constexpr num_rows = 12; + + auto get_struct_column = [] { + auto nums_member = + fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + auto 
strings_member = strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; + auto struct_column = structs_column_wrapper{nums_member, strings_member}.release(); + // Reset null-mask, a posteriori. Nulls will not be pushed down to children. + auto const null_iter = nulls_at({1, 2, 11}); + struct_column->set_null_mask( + cudf::test::detail::make_null_mask(null_iter, null_iter + num_rows)); + return struct_column; + }; + + auto const possibly_null_structs = get_struct_column(); + + auto const definitely_null_structs = [&] { + auto struct_column = get_struct_column(); + struct_column->set_null_mask(create_null_mask(num_rows, mask_state::ALL_NULL)); + return struct_column; + }(); + + strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "X", "X", "X"}, + nulls_at({9, 10, 11})}; std::vector requests; requests.emplace_back(groupby::scan_request()); requests.emplace_back(groupby::scan_request()); - requests[0].values = *struct_col; + requests[0].values = *possibly_null_structs; requests[0].aggregations.push_back(make_dense_rank_aggregation()); requests[0].aggregations.push_back(make_rank_aggregation()); - requests[1].values = *null_col; + requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[1].values = *definitely_null_structs; requests[1].aggregations.push_back(make_dense_rank_aggregation()); requests[1].aggregations.push_back(make_rank_aggregation()); + requests[1].aggregations.push_back(make_percent_rank_aggregation()); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); - auto result = gb_obj.scan(requests); - - auto expected_dense_vals = - fixed_width_column_wrapper{1, 2, 2, 3, 4, 5, 1, 1, 2, 1, 1, 2}; - auto expected_rank_vals = - fixed_width_column_wrapper{1, 2, 2, 4, 5, 6, 1, 1, 3, 1, 1, 3}; - auto expected_null_result = - fixed_width_column_wrapper{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[0], expected_dense_vals); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[1], expected_rank_vals); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[1].results[0], expected_null_result); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[1].results[1], expected_null_result); + auto [result_keys, agg_results] = gb_obj.scan(requests); + + auto expected_dense = rank_result_col{1, 2, 2, 3, 4, 5, 1, 1, 2, 1, 1, 2}; + auto expected_rank = rank_result_col{1, 2, 2, 4, 5, 6, 1, 1, 3, 1, 1, 3}; + auto expected_percent = percent_result_col{ + 0.0, 1.0 / 5, 1.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5, 0.0, 0.0, 2.0 / 2, 0.0, 0.0, 2.0 / 2}; + auto expected_rank_for_null = rank_result_col{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + auto expected_percent_for_null = + percent_result_col{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[0], expected_dense); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[1], expected_rank); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[2], expected_percent); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[1].results[0], expected_rank_for_null); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[1].results[1], expected_rank_for_null); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[1].results[2], expected_percent_for_null); } /* List support dependent on https://github.com/rapidsai/cudf/issues/8683 @@ -372,67 +424,72 @@ TYPED_TEST(list_groupby_rank_scan_test, lists) 
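// A small worked case of the three grouped scans these tests exercise
// (values illustrative): with pre-sorted keys {0,0,0,1,1} and order-by values
// {5,5,4,3,3}, each scan restarts at every key-group boundary:
//   dense_rank   -> {1, 1, 2, 1, 1}
//   rank         -> {1, 1, 3, 1, 1}
//   percent_rank -> {(1-1)/2, (1-1)/2, (3-1)/2, (1-1)/1, (1-1)/1}
//                 = {0.0, 0.0, 1.0, 0.0, 0.0}
// where each denominator is (group size - 1).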
TEST(groupby_rank_scan_test, bools) { - fixed_width_column_wrapper keys = {{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; - fixed_width_column_wrapper order_col1{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col2{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col3{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col4{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col5{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - - fixed_width_column_wrapper expected_dense_vals{{1, 1, 2, 2, 2, 2, 1, 2, 3, 1, 1, 2}}; - fixed_width_column_wrapper expected_rank_vals{{1, 1, 3, 3, 3, 3, 1, 2, 3, 1, 1, 3}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order_with_nulls, expected_dense_vals, expected_rank_vals); + using bools = fixed_width_column_wrapper; + using null_iter_t = decltype(nulls_at({})); + + auto const keys = bools{{0, 0, 0, 0, 0, 0, 1, 1, 1, X, X, X}, nulls_at({9, 10, 11})}; + auto const nulls_6_8 = nulls_at({6, 8}); + auto const make_order_by = [&] { return bools{{0, 0, 1, 1, 1, 1, X, 1, X, 0, 0, 1}, nulls_6_8}; }; + auto const make_structs = [&](null_iter_t const& null_iter = all_valid) { + auto member_1 = make_order_by(); + auto member_2 = make_order_by(); + return structs_column_wrapper{{member_1, member_2}, null_iter}; + }; + + auto const order_by = make_order_by(); + auto const order_by_structs = make_structs(); + auto const order_by_structs_with_nulls = make_structs(nulls_6_8); + + auto const expected_dense = rank_result_col{{1, 1, 2, 2, 2, 2, 1, 2, 3, 1, 1, 2}}; + auto const expected_rank = rank_result_col{{1, 1, 3, 3, 3, 3, 1, 2, 3, 1, 1, 3}}; + auto const expected_percent = percent_result_col{ + {0.0, 0.0, 2.0 / 5, 2.0 / 5, 2.0 / 5, 2.0 / 5, 0.0, 1.0 / 2, 2.0 / 2, 0.0, 0.0, 2.0 / 2}}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_structs, expected_dense, expected_rank, expected_percent); + test_rank_scans( + keys, order_by_structs_with_nulls, expected_dense, expected_rank, expected_percent); } TEST(groupby_rank_scan_test, strings) { - strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; - strings_column_wrapper order_col1{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", "-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - strings_column_wrapper order_col2{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", "-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - strings_column_wrapper order_col3{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", "-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - strings_column_wrapper order_col4{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", "-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - strings_column_wrapper order_col5{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", 
"-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, - {1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0}}; - - fixed_width_column_wrapper expected_dense_vals{{1, 2, 3, 4, 5, 5, 1, 1, 2, 1, 1, 2}}; - fixed_width_column_wrapper expected_rank_vals{{1, 2, 3, 4, 5, 5, 1, 1, 3, 1, 1, 3}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order_with_nulls, expected_dense_vals, expected_rank_vals); + using strings = strings_column_wrapper; + using null_iter_t = decltype(nulls_at({})); + + auto const keys = + strings{{"0", "0", "0", "0", "0", "0", "1", "1", "1", "X", "X", "X"}, nulls_at({9, 10, 11})}; + auto const nulls_2_8 = nulls_at({2, 8}); + auto const make_order_by = [&] { + return strings{{"-1", "-2", "X", "-2", "-3", "-3", "-4", "-4", "X", "-5", "-5", "-6"}, + nulls_2_8}; + }; + auto const make_structs = [&](null_iter_t const& null_iter = all_valid) { + auto member_1 = make_order_by(); + auto member_2 = make_order_by(); + return structs_column_wrapper{{member_1, member_2}, null_iter}; + }; + + auto const order_by = make_order_by(); + auto const order_by_structs = make_structs(); + auto const order_by_structs_with_nulls = make_structs(nulls_at({4, 5, 11})); + + auto const expected_dense = rank_result_col{{1, 2, 3, 4, 5, 5, 1, 1, 2, 1, 1, 2}}; + auto const expected_rank = rank_result_col{{1, 2, 3, 4, 5, 5, 1, 1, 3, 1, 1, 3}}; + auto const expected_percent = percent_result_col{ + {0.0, 1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 4.0 / 5, 0.0, 0.0, 2.0 / 2, 0.0, 0.0, 2.0 / 2}}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_structs, expected_dense, expected_rank, expected_percent); + test_rank_scans( + keys, order_by_structs_with_nulls, expected_dense, expected_rank, expected_percent); } TEST_F(groupby_rank_scan_test_failures, test_exception_triggers) { using T = uint32_t; - fixed_width_column_wrapper keys{{1, 2, 3}, {1, 1, 0}}; - fixed_width_column_wrapper col{3, 3, 1}; + auto const keys = input{{1, 2, 3}, null_at(2)}; + auto const col = input{3, 3, 1}; CUDF_EXPECT_THROW_MESSAGE( test_single_scan(keys, diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp index 5947e309bec..be7da4a784c 100644 --- a/cpp/tests/groupby/sum_tests.cpp +++ b/cpp/tests/groupby/sum_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -156,6 +156,27 @@ TYPED_TEST(groupby_sum_test, dictionary) force_use_sort_impl::YES); } +struct overflow_test : public cudf::test::BaseFixture { +}; +TEST_F(overflow_test, overflow_integer) +{ + using int32_col = fixed_width_column_wrapper; + using int64_col = fixed_width_column_wrapper; + + auto const keys = int32_col{0, 0}; + auto const vals = int32_col{-2147483648, -2147483648}; + auto const expect_keys = int32_col{0}; + auto const expect_vals = int64_col{-4294967296L}; + + auto test_sum = [&](auto const use_sort) { + auto agg = make_sum_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), use_sort); + }; + + test_sum(force_use_sort_impl::NO); + test_sum(force_use_sort_impl::YES); +} + template struct FixedPointTestAllReps : public cudf::test::BaseFixture { }; diff --git a/cpp/tests/io/file_io_test.cpp b/cpp/tests/io/file_io_test.cpp new file mode 100644 index 00000000000..b546239fdca --- /dev/null +++ b/cpp/tests/io/file_io_test.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + +// Base test fixture for tests +struct CuFileIOTest : public cudf::test::BaseFixture { +}; + +TEST_F(CuFileIOTest, SliceSize) +{ + std::vector> test_cases{ + {1 << 20, 1 << 18}, {1 << 18, 1 << 20}, {1 << 20, 3333}, {0, 1 << 18}, {0, 0}, {1 << 20, 0}}; + for (auto const& test_case : test_cases) { + auto const slices = cudf::io::detail::make_file_io_slices(test_case.first, test_case.second); + if (slices.empty()) { + ASSERT_EQ(test_case.first, 0); + } else { + ASSERT_EQ(slices.front().offset, 0); + ASSERT_EQ(slices.back().offset + slices.back().size, test_case.first); + for (auto i = 1u; i < slices.size(); ++i) { + ASSERT_EQ(slices[i].offset, slices[i - 1].offset + slices[i - 1].size); + } + } + } +} + +CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index e6ae709f009..57041e448a2 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
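The overflow_integer test above hinges on the groupby SUM of INT32 inputs producing an INT64 result, so adding INT32_MIN to itself yields -4294967296 instead of wrapping. A standalone sketch of the same widening in plain C++ (no cudf calls; only the promotion behavior implied by the expected int64_col is assumed):

```cpp
#include <cstdint>
#include <iostream>

int main()
{
  int32_t const v   = INT32_MIN;                // -2147483648, as in the test
  int64_t const sum = int64_t{v} + int64_t{v};  // widen each operand before adding
  std::cout << sum << '\n';                     // prints -4294967296
  return 0;
}
```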
@@ -1004,7 +1004,7 @@ TEST_F(JoinTest, EmptyRightTableInnerJoin) std::size_t const size_gold = 0; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.inner_join(t0, optional_size); column_wrapper col_gold_0{}; column_wrapper col_gold_1{}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1043,7 +1043,7 @@ TEST_F(JoinTest, EmptyRightTableLeftJoin) std::size_t const size_gold = 5; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.left_join(t0, optional_size); column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1082,7 +1082,7 @@ TEST_F(JoinTest, EmptyRightTableFullJoin) std::size_t const size_gold = 5; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.full_join(t0, optional_size); column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1310,7 +1310,7 @@ TEST_F(JoinTest, HashJoinSequentialProbes) std::size_t const size_gold = 9; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.full_join(t0, optional_size); column_wrapper col_gold_0{{NoneValue, NoneValue, NoneValue, NoneValue, 4, 0, 1, 2, 3}}; column_wrapper col_gold_1{{0, 1, 2, 3, 4, NoneValue, NoneValue, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1330,7 +1330,7 @@ TEST_F(JoinTest, HashJoinSequentialProbes) std::size_t const size_gold = 5; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.left_join(t0, optional_size); column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, 4}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1350,7 +1350,7 @@ TEST_F(JoinTest, HashJoinSequentialProbes) std::size_t const size_gold = 3; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.inner_join(t0, optional_size); column_wrapper col_gold_0{{2, 4, 0}}; column_wrapper col_gold_1{{1, 1, 4}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1390,7 +1390,7 @@ TEST_F(JoinTest, HashJoinWithStructsAndNulls) { auto output_size = hash_join.left_join_size(t0); EXPECT_EQ(5, output_size); - auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, output_size); + auto result = hash_join.left_join(t0, output_size); column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{0, NoneValue, 2, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1400,7 +1400,7 @@ TEST_F(JoinTest, HashJoinWithStructsAndNulls) { auto output_size = hash_join.inner_join_size(t0); EXPECT_EQ(2, output_size); - auto 
result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, output_size); + auto result = hash_join.inner_join(t0, output_size); column_wrapper col_gold_0{{0, 2}}; column_wrapper col_gold_1{{0, 2}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1410,7 +1410,7 @@ TEST_F(JoinTest, HashJoinWithStructsAndNulls) { auto output_size = hash_join.full_join_size(t0); EXPECT_EQ(8, output_size); - auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, output_size); + auto result = hash_join.full_join(t0, output_size); column_wrapper col_gold_0{{NoneValue, NoneValue, NoneValue, 0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{1, 3, 4, 0, NoneValue, 2, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp index 2249fac4e2e..ffcbce08163 100644 --- a/cpp/tests/reductions/rank_tests.cpp +++ b/cpp/tests/reductions/rank_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,22 +20,40 @@ #include #include +#include #include #include +#include using aggregation = cudf::aggregation; using cudf::null_policy; using cudf::scan_type; -using namespace cudf::test::iterators; + +namespace cudf::test { + +using namespace iterators; + +template +using input = fixed_width_column_wrapper; +using rank_result_col = fixed_width_column_wrapper; +using percent_result_t = + cudf::detail::target_type_t; +using percent_result_col = fixed_width_column_wrapper; + +auto const rank = cudf::make_rank_aggregation(); +auto const dense_rank = cudf::make_dense_rank_aggregation(); +auto const percent_rank = cudf::make_percent_rank_aggregation(); + +auto constexpr INCLUSIVE_SCAN = cudf::scan_type::INCLUSIVE; +auto constexpr INCLUDE_NULLS = cudf::null_policy::INCLUDE; template struct TypedRankScanTest : BaseScanTest { inline void test_ungrouped_rank_scan(cudf::column_view const& input, cudf::column_view const& expect_vals, - std::unique_ptr const& agg, - null_policy null_handling) + std::unique_ptr const& agg) { - auto col_out = cudf::scan(input, agg, scan_type::INCLUSIVE, null_handling); + auto col_out = cudf::scan(input, agg, INCLUSIVE_SCAN, INCLUDE_NULLS); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expect_vals, col_out->view()); } }; @@ -55,16 +73,25 @@ TYPED_TEST(TypedRankScanTest, Rank) return make_vector({-120, -120, -120, -16, -16, 5, 6, 6, 6, 6, 34, 113}); return make_vector({5, 5, 5, 6, 6, 9, 11, 11, 11, 11, 14, 34}); }(); - auto col = this->make_column(v); - - auto const expected_dense_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 5, 6}; - auto const expected_rank_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 4, 4, 6, 7, 7, 7, 7, 11, 12}; - this->test_ungrouped_rank_scan( - *col, expected_dense_vals, cudf::make_dense_rank_aggregation(), null_policy::INCLUDE); - this->test_ungrouped_rank_scan( - *col, expected_rank_vals, cudf::make_rank_aggregation(), null_policy::INCLUDE); + auto const col = this->make_column(v); + + auto const expected_dense = rank_result_col{1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 5, 6}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 4, 6, 7, 7, 7, 7, 11, 12}; + auto const expected_percent = percent_result_col{0.0, + 0.0, + 0.0, + 3.0 / 11, + 3.0 / 11, + 5.0 / 
11, + 6.0 / 11, + 6.0 / 11, + 6.0 / 11, + 6.0 / 11, + 10.0 / 11, + 11.0 / 11}; + this->test_ungrouped_rank_scan(*col, expected_dense, dense_rank); + this->test_ungrouped_rank_scan(*col, expected_rank, rank); + this->test_ungrouped_rank_scan(*col, expected_percent, percent_rank); } TYPED_TEST(TypedRankScanTest, RankWithNulls) @@ -74,132 +101,159 @@ TYPED_TEST(TypedRankScanTest, RankWithNulls) return make_vector({-120, -120, -120, -16, -16, 5, 6, 6, 6, 6, 34, 113}); return make_vector({5, 5, 5, 6, 6, 9, 11, 11, 11, 11, 14, 34}); }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0}); - auto col = this->make_column(v, b); - - auto const expected_dense_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 2, 3, 4, 5, 5, 6, 6, 7, 8}; - auto const expected_rank_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 4, 5, 6, 7, 7, 9, 9, 11, 12}; - this->test_ungrouped_rank_scan( - *col, expected_dense_vals, cudf::make_dense_rank_aggregation(), null_policy::INCLUDE); - this->test_ungrouped_rank_scan( - *col, expected_rank_vals, cudf::make_rank_aggregation(), null_policy::INCLUDE); + auto const null_iter = nulls_at({3, 6, 7, 11}); + auto const b = thrust::host_vector(null_iter, null_iter + v.size()); + auto col = this->make_column(v, b); + + auto const expected_dense = rank_result_col{1, 1, 1, 2, 3, 4, 5, 5, 6, 6, 7, 8}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 5, 6, 7, 7, 9, 9, 11, 12}; + auto const expected_percent = percent_result_col{0.0, + 0.0, + 0.0, + 3.0 / 11, + 4.0 / 11, + 5.0 / 11, + 6.0 / 11, + 6.0 / 11, + 8.0 / 11, + 8.0 / 11, + 10.0 / 11, + 11.0 / 11}; + this->test_ungrouped_rank_scan(*col, expected_dense, dense_rank); + this->test_ungrouped_rank_scan(*col, expected_rank, rank); + this->test_ungrouped_rank_scan(*col, expected_percent, percent_rank); } -TYPED_TEST(TypedRankScanTest, MixedStructs) +namespace { +template +auto make_input_column() { - auto const v = [] { - if (std::is_signed::value) - return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); - return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); - }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); - auto col = this->make_column(v, b); - auto strings = cudf::test::strings_column_wrapper{ + if constexpr (std::is_same_v) { + return strings_column_wrapper{{"0", "0", "4", "4", "4", "5", "7", "7", "7", "9", "9", "9"}, + null_at(5)}; + } else { + return (std::is_signed_v) + ? 
input{{-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}, null_at(5)} + : input{{0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}, null_at(5)}; + } +} + +auto make_strings_column() +{ + return strings_column_wrapper{ {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - std::vector> vector_of_columns; - vector_of_columns.push_back(std::move(col)); - vector_of_columns.push_back(strings.release()); - auto struct_col = cudf::test::structs_column_wrapper{std::move(vector_of_columns)}.release(); - - auto expected_dense_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8}; - auto expected_rank_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 3, 3, 5, 6, 7, 7, 9, 10, 10, 12}; - - this->test_ungrouped_rank_scan( - *struct_col, expected_dense_vals, cudf::make_dense_rank_aggregation(), null_policy::INCLUDE); - this->test_ungrouped_rank_scan( - *struct_col, expected_rank_vals, cudf::make_rank_aggregation(), null_policy::INCLUDE); +} + +template +auto make_mixed_structs_column() +{ + auto col = make_input_column(); + auto strings = make_strings_column(); + return structs_column_wrapper{{col, strings}}; +} +} // namespace + +TYPED_TEST(TypedRankScanTest, MixedStructs) +{ + auto const struct_col = make_mixed_structs_column(); + auto const expected_dense = rank_result_col{1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8}; + auto const expected_rank = rank_result_col{1, 1, 3, 3, 5, 6, 7, 7, 9, 10, 10, 12}; + auto const expected_percent = percent_result_col{0.0, + 0.0, + 2.0 / 11, + 2.0 / 11, + 4.0 / 11, + 5.0 / 11, + 6.0 / 11, + 6.0 / 11, + 8.0 / 11, + 9.0 / 11, + 9.0 / 11, + 11.0 / 11}; + + this->test_ungrouped_rank_scan(struct_col, expected_dense, dense_rank); + this->test_ungrouped_rank_scan(struct_col, expected_rank, rank); + this->test_ungrouped_rank_scan(struct_col, expected_percent, percent_rank); } TYPED_TEST(TypedRankScanTest, NestedStructs) { - auto const v = [] { - if (std::is_signed::value) - return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); - return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); + auto const nested_col = [&] { + auto struct_col = [&] { + auto col = make_input_column(); + auto strings = make_strings_column(); + return structs_column_wrapper{{col, strings}}; + }(); + auto col = make_input_column(); + return structs_column_wrapper{{struct_col, col}}; }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); - auto col1 = this->make_column(v, b); - auto col2 = this->make_column(v, b); - auto col3 = this->make_column(v, b); - auto col4 = this->make_column(v, b); - auto strings1 = cudf::test::strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto strings2 = cudf::test::strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - std::vector> struct_columns; - struct_columns.push_back(std::move(col1)); - struct_columns.push_back(strings1.release()); - auto struct_col = cudf::test::structs_column_wrapper{std::move(struct_columns)}; - std::vector> nested_columns; - nested_columns.push_back(struct_col.release()); - nested_columns.push_back(std::move(col2)); - auto nested_col = cudf::test::structs_column_wrapper{std::move(nested_columns)}; - std::vector> flat_columns; - flat_columns.push_back(std::move(col3)); - flat_columns.push_back(strings2.release()); - flat_columns.push_back(std::move(col4)); - auto flat_col = cudf::test::structs_column_wrapper{std::move(flat_columns)}; - - auto 
dense_out = cudf::scan( - nested_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto dense_expected = cudf::scan( - flat_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_out = cudf::scan( - nested_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_expected = - cudf::scan(flat_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + auto const flat_col = [&] { + auto col = make_input_column(); + auto strings_col = make_strings_column(); + auto nuther_col = make_input_column(); + return structs_column_wrapper{{col, strings_col, nuther_col}}; + }(); + auto const dense_out = cudf::scan(nested_col, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const dense_expected = cudf::scan(flat_col, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), dense_expected->view()); + + auto const rank_out = cudf::scan(nested_col, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const rank_expected = cudf::scan(flat_col, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), rank_expected->view()); + + auto const percent_out = cudf::scan(nested_col, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const percent_expected = cudf::scan(flat_col, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(percent_out->view(), percent_expected->view()); } -TYPED_TEST(TypedRankScanTest, structsWithNullPushdown) +TYPED_TEST(TypedRankScanTest, StructsWithNullPushdown) { - auto const v = [] { - if (std::is_signed::value) - return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); - return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); - }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); - auto col = this->make_column(v, b); - auto strings = cudf::test::strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - std::vector> struct_columns; - struct_columns.push_back(std::move(col)); - struct_columns.push_back(strings.release()); - - auto struct_col = - cudf::make_structs_column(12, std::move(struct_columns), 0, rmm::device_buffer{}); - - struct_col->set_null_mask(create_null_mask(12, cudf::mask_state::ALL_NULL)); - auto expected_null_result = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - auto dense_null_out = cudf::scan( - *struct_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_null_out = cudf::scan( - *struct_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_null_out->view(), expected_null_result); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_null_out->view(), expected_null_result); - - auto const struct_nulls = - thrust::host_vector(std::vector{1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - struct_col->set_null_mask( - cudf::test::detail::make_null_mask(struct_nulls.begin(), struct_nulls.end())); - auto expected_dense_vals = - cudf::test::fixed_width_column_wrapper{1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9}; - auto expected_rank_vals = - cudf::test::fixed_width_column_wrapper{1, 2, 2, 4, 5, 6, 7, 7, 9, 10, 10, 12}; - auto dense_out = cudf::scan( - *struct_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_out = cudf::scan( - *struct_col, 
cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), expected_dense_vals); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), expected_rank_vals); + auto struct_col = make_mixed_structs_column().release(); + + // First, verify that if the structs column has only nulls, all output rows are ranked 1. + { + struct_col->set_null_mask( + create_null_mask(12, cudf::mask_state::ALL_NULL)); // Null mask not pushed down to members. + auto const expected_null_result = rank_result_col{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + auto const expected_percent_rank_null_result = + percent_result_col{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + auto const dense_out = cudf::scan(*struct_col, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const rank_out = cudf::scan(*struct_col, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const percent_out = cudf::scan(*struct_col, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), expected_null_result); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), expected_null_result); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(percent_out->view(), expected_percent_rank_null_result); + } + + // Next, verify that if the structs column has a null mask that is NOT pushed down to members, + // the ranks are still correct. + { + auto const null_iter = nulls_at({1, 2}); + struct_col->set_null_mask( + cudf::test::detail::make_null_mask(null_iter, null_iter + struct_col->size())); + auto const expected_dense = rank_result_col{1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9}; + auto const expected_rank = rank_result_col{1, 2, 2, 4, 5, 6, 7, 7, 9, 10, 10, 12}; + auto const expected_percent = percent_result_col{0.0, + 1.0 / 11, + 1.0 / 11, + 3.0 / 11, + 4.0 / 11, + 5.0 / 11, + 6.0 / 11, + 6.0 / 11, + 8.0 / 11, + 9.0 / 11, + 9.0 / 11, + 11.0 / 11}; + auto const dense_out = cudf::scan(*struct_col, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const rank_out = cudf::scan(*struct_col, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const percent_out = cudf::scan(*struct_col, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), expected_dense); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), expected_rank); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(percent_out->view(), expected_percent); + } } struct RankScanTest : public cudf::test::BaseFixture { @@ -207,49 +261,66 @@ struct RankScanTest : public cudf::test::BaseFixture { TEST(RankScanTest, BoolRank) { - cudf::test::fixed_width_column_wrapper vals{0, 0, 0, 6, 6, 9, 11, 11, 11, 11, 14, 34}; - cudf::test::fixed_width_column_wrapper expected_dense_vals{ - 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2}; - cudf::test::fixed_width_column_wrapper expected_rank_vals{ - 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4}; - - auto dense_out = cudf::scan( - vals, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_out = - cudf::scan(vals, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense_vals, dense_out->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank_vals, rank_out->view()); + auto const vals = input{0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + auto const expected_dense = rank_result_col{1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4}; + auto const expected_percent = percent_result_col{0.0, + 0.0, +
0.0, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11}; + + auto const dense_out = cudf::scan(vals, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const rank_out = cudf::scan(vals, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const percent_out = cudf::scan(vals, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense, dense_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank, rank_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_percent, percent_out->view()); } TEST(RankScanTest, BoolRankWithNull) { - cudf::test::fixed_width_column_wrapper vals{{0, 0, 0, 6, 6, 9, 11, 11, 11, 11, 14, 34}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0}}; - cudf::table_view order_table{std::vector{vals}}; - cudf::test::fixed_width_column_wrapper expected_dense_vals{ - 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3}; - cudf::test::fixed_width_column_wrapper expected_rank_vals{ - 1, 1, 1, 4, 4, 4, 4, 4, 9, 9, 9, 9}; - - auto nullable_dense_out = cudf::scan( - vals, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto nullable_rank_out = - cudf::scan(vals, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense_vals, nullable_dense_out->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank_vals, nullable_rank_out->view()); + auto const vals = input{{0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}, nulls_at({8, 9, 10, 11})}; + auto const expected_dense = rank_result_col{1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 4, 4, 4, 4, 9, 9, 9, 9}; + auto const expected_percent = percent_result_col{0.0, + 0.0, + 0.0, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 8.0 / 11, + 8.0 / 11, + 8.0 / 11, + 8.0 / 11}; + + auto nullable_dense_out = cudf::scan(vals, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto nullable_rank_out = cudf::scan(vals, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto nullable_percent_out = cudf::scan(vals, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense, nullable_dense_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank, nullable_rank_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_percent, nullable_percent_out->view()); } TEST(RankScanTest, ExclusiveScan) { - cudf::test::fixed_width_column_wrapper vals{3, 4, 5}; - cudf::test::fixed_width_column_wrapper order_col{3, 3, 1}; - cudf::table_view order_table{std::vector{order_col}}; - - CUDF_EXPECT_THROW_MESSAGE( - cudf::scan( - vals, cudf::make_dense_rank_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE), - "Unsupported dense rank aggregation operator for exclusive scan"); - CUDF_EXPECT_THROW_MESSAGE( - cudf::scan(vals, cudf::make_rank_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE), - "Unsupported rank aggregation operator for exclusive scan"); + auto const vals = input{3, 4, 5}; + + CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, dense_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), + "Dense rank aggregation operator requires an inclusive scan"); + CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), + "Rank aggregation operator requires an inclusive scan"); + CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, percent_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), + "Percent rank aggregation operator requires an inclusive scan"); } + +} // namespace cudf::test diff --git 
a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index a72ec61dd8f..bacd62ac86e 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,12 +14,12 @@ * limitations under the License. */ +#include #include #include #include #include #include -#include #include #include @@ -250,6 +250,33 @@ TEST_F(StringsContainsTests, OctalTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsContainsTests, HexTest) +{ + std::vector ascii_chars( // all possible matchable chars + {thrust::make_counting_iterator(0), thrust::make_counting_iterator(127)}); + auto const count = static_cast(ascii_chars.size()); + std::vector offsets( + {thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + count + 1}); + auto d_chars = cudf::detail::make_device_uvector_sync(ascii_chars); + auto d_offsets = cudf::detail::make_device_uvector_sync(offsets); + auto input = cudf::make_strings_column(d_chars, d_offsets); + + auto strings_view = cudf::strings_column_view(input->view()); + for (auto ch : ascii_chars) { + std::stringstream str; + str << "\\x" << std::setfill('0') << std::setw(2) << std::hex << static_cast(ch); + std::string pattern = str.str(); + + auto results = cudf::strings::contains_re(strings_view, pattern); + // only one element in the input should match ch + auto true_dat = cudf::detail::make_counting_transform_iterator( + 0, [ch](auto idx) { return ch == static_cast(idx); }); + cudf::test::fixed_width_column_wrapper expected(true_dat, true_dat + count); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + TEST_F(StringsContainsTests, EmbeddedNullCharacter) { std::vector data(10); diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 516882bd8ad..9a28dbf0697 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
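HexTest above generates one pattern per ASCII code point, so each contains_re call should match exactly the single row holding that character. A standalone sketch of the pattern construction it uses (host-only; mirrors the test's stringstream formatting):

```cpp
#include <iomanip>
#include <sstream>
#include <string>

// Build a "\xNN" regex escape for a single character code, zero-padded to two
// hex digits, e.g. 65 ('A') -> "\x41".
std::string hex_pattern(int ch)
{
  std::stringstream str;
  str << "\\x" << std::setfill('0') << std::setw(2) << std::hex << ch;
  return str.str();
}
```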
@@ -150,6 +150,39 @@ TEST_F(StringsExtractTests, ExtractEventTest) } } +TEST_F(StringsExtractTests, MultiLine) +{ + auto input = + cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); + auto view = cudf::strings_column_view(input); + + auto results = cudf::strings::extract(view, "(^[a-c]+$)", cudf::strings::regex_flags::MULTILINE); + cudf::test::strings_column_wrapper expected_multiline({"abc", "abc", "abc", "", "abc"}, + {1, 1, 1, 0, 1}); + auto expected = cudf::table_view{{expected_multiline}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + results = cudf::strings::extract(view, "^([a-c]+)$"); + cudf::test::strings_column_wrapper expected_default({"", "", "abc", "", ""}, {0, 0, 1, 0, 0}); + expected = cudf::table_view{{expected_default}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); +} + +TEST_F(StringsExtractTests, DotAll) +{ + auto input = cudf::test::strings_column_wrapper({"abc\nfa\nef", "fff\nabbc\nfff", "abcdef", ""}); + auto view = cudf::strings_column_view(input); + + auto results = cudf::strings::extract(view, "(a.*f)", cudf::strings::regex_flags::DOTALL); + cudf::test::strings_column_wrapper expected_dotall({"abc\nfa\nef", "abbc\nfff", "abcdef", ""}, + {1, 1, 1, 0}); + auto expected = cudf::table_view{{expected_dotall}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + results = cudf::strings::extract(view, "(a.*f)"); + cudf::test::strings_column_wrapper expected_default({"", "", "abcdef", ""}, {0, 0, 1, 0}); + expected = cudf::table_view{{expected_default}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; @@ -181,7 +214,7 @@ TEST_F(StringsExtractTests, ExtractAllTest) cudf::test::strings_column_wrapper input(h_input.begin(), h_input.end(), validity); auto sv = cudf::strings_column_view(input); - auto results = cudf::strings::extract_all(sv, "(\\d+) (\\w+)"); + auto results = cudf::strings::extract_all_record(sv, "(\\d+) (\\w+)"); bool valids[] = {true, true, true, false, false, false, true}; using LCW = cudf::test::lists_column_wrapper; @@ -201,7 +234,7 @@ TEST_F(StringsExtractTests, Errors) cudf::test::strings_column_wrapper input({"this column intentionally left blank"}); auto sv = cudf::strings_column_view(input); EXPECT_THROW(cudf::strings::extract(sv, "\\w+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::extract_all(sv, "\\w+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::extract_all_record(sv, "\\w+"), cudf::logic_error); } TEST_F(StringsExtractTests, MediumRegex) diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 5872a9e5bb7..81122b1c5d8 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
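The MultiLine and DotAll tests above pin down the two regex flags: MULTILINE lets ^ and $ anchor at embedded newlines, while DOTALL lets '.' match across them. A condensed sketch of the MULTILINE contrast, using the same extract signature the tests call (the include paths are my assumption):

```cpp
#include <cudf/strings/extract.hpp>
#include <cudf/strings/regex/flags.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void regex_flags_demo()
{
  auto const input = cudf::test::strings_column_wrapper({"abc\nfff"});
  auto const view  = cudf::strings_column_view(input);
  // Default flags: "(^[a-c]+$)" must span the whole string, so "abc\nfff" extracts null.
  auto const whole_string = cudf::strings::extract(view, "(^[a-c]+$)");
  // MULTILINE: '\n' ends a line, so the leading "abc" is extracted.
  auto const per_line =
    cudf::strings::extract(view, "(^[a-c]+$)", cudf::strings::regex_flags::MULTILINE);
}
```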
@@ -112,6 +112,22 @@ TEST_F(StringsConvertTest, ToFixedPointDecimal128) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } +TEST_F(StringsConvertTest, ToFixedPointLargeScale) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + auto const strings = cudf::test::strings_column_wrapper({"0.05", "0.06", "0.50", "5.01"}); + + auto const scale = scale_type{-25}; + auto const type = cudf::data_type{cudf::type_to_id(), scale}; + auto const results = cudf::strings::to_fixed_point(cudf::strings_column_view(strings), type); + + auto const expected = fp_wrapper{{5, 6, 50, 501}, scale_type{-2}}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); +} + TEST_F(StringsConvertTest, FromFixedPointDecimal128) { using namespace numeric; diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index de4e48fd70a..f0d7315929b 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,14 +18,15 @@ #include #include #include +#include #include #include #include #include #include +#include #include -#include #include @@ -240,41 +241,6 @@ TEST_F(StringsSplitTest, RSplitWhitespaceWithMax) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, *expected); } -TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) -{ - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::split(zero_size_strings_column); - EXPECT_TRUE(results->num_columns() == 1); - cudf::test::expect_strings_empty(results->get_column(0)); - results = cudf::strings::rsplit(zero_size_strings_column); - EXPECT_TRUE(results->num_columns() == 1); - cudf::test::expect_strings_empty(results->get_column(0)); -} - -// This test specifically for https://github.com/rapidsai/custrings/issues/119 -TEST_F(StringsSplitTest, AllNullsCase) -{ - std::vector h_strings{nullptr, nullptr, nullptr}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - - auto results = cudf::strings::split(cudf::strings_column_view(strings)); - EXPECT_TRUE(results->num_columns() == 1); - auto column = results->get_column(0).view(); - EXPECT_TRUE(column.size() == 3); - EXPECT_TRUE(column.has_nulls()); - EXPECT_TRUE(column.null_count() == column.size()); - results = cudf::strings::split(cudf::strings_column_view(strings), cudf::string_scalar("-")); - EXPECT_TRUE(results->num_columns() == 1); - column = results->get_column(0); - EXPECT_TRUE(column.size() == 3); - EXPECT_TRUE(column.has_nulls()); - EXPECT_TRUE(column.null_count() == column.size()); -} - TEST_F(StringsSplitTest, SplitRecord) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; @@ -339,6 +305,127 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, SplitRegex) +{ + std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper 
input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + { + auto result = cudf::strings::split_re(sv, "\\s+"); + + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, validity); + cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0}); + cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0}); + auto expected = cudf::table_view({col0, col1, col2}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + + // rsplit == split when using default parameters + result = cudf::strings::rsplit_re(sv, "\\s+"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } + + { + auto result = cudf::strings::split_re(sv, "[eé]"); + + cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, validity); + cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""}, + {1, 0, 1, 1, 0}); + cudf::test::strings_column_wrapper col2({"s", "", " ", "", ""}, {1, 0, 1, 0, 0}); + cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0}); + auto expected = cudf::table_view({col0, col1, col2, col3}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + + // rsplit == split when using default parameters + result = cudf::strings::rsplit_re(sv, "[eé]"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } +} + +TEST_F(StringsSplitTest, SplitRecordRegex) +{ + std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + using LCW = cudf::test::lists_column_wrapper; + { + auto result = cudf::strings::split_record_re(sv, "\\s+"); + + LCW expected( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", ""}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + // rsplit == split when using default parameters + result = cudf::strings::rsplit_record_re(sv, "\\s+"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } + + { + auto result = cudf::strings::split_record_re(sv, "[eé]"); + + LCW expected({LCW{" H", "llo th", "s", ""}, + LCW{}, + LCW{"ar", " som", " "}, + LCW{"t", "st String"}, + LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + // rsplit == split when using default parameters + result = cudf::strings::rsplit_record_re(sv, "[eé]"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } +} + +TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) +{ + std::vector h_strings{" Héllo\tthesé", nullptr, "are\nsome ", "tést\rString", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + { + auto result = cudf::strings::split_re(sv, "\\s+", 1); + + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some ", "String", ""}, + {1, 0, 1, 1, 0}); + auto expected = cudf::table_view({col0, col1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + + // split everything is the same output as maxsplit==2 for 
the test input column here + result = cudf::strings::split_re(sv, "\\s+", 2); + auto expected2 = cudf::strings::split_re(sv, "\\s+"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected2->view()); + } + { + auto result = cudf::strings::split_record_re(sv, "\\s", 1); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected1( + {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected1); + + result = cudf::strings::split_record_re(sv, "\\s", 2); + LCW expected2( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", " "}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected2); + + // split everything is the same output as maxsplit==3 for the test input column here + result = cudf::strings::split_record_re(sv, "\\s", 3); + auto expected0 = cudf::strings::split_record_re(sv, "\\s"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); + } +} + TEST_F(StringsSplitTest, RSplitRecord) { std::vector h_strings{ @@ -430,14 +517,100 @@ TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } -TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns) +TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit) +{ + std::vector h_strings{" Héllo\tthesé", nullptr, "are some\n ", "tést\rString", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + { + auto result = cudf::strings::rsplit_re(sv, "\\s+", 1); + + cudf::test::strings_column_wrapper col0({" Héllo", "", "are some", "tést", ""}, validity); + cudf::test::strings_column_wrapper col1({"thesé", "", "", "String", ""}, {1, 0, 1, 1, 0}); + auto expected = cudf::table_view({col0, col1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } + { + auto result = cudf::strings::rsplit_record_re(sv, "\\s+", 1); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + // split everything is the same output as any maxsplit > 2 for the test input column here + result = cudf::strings::rsplit_record_re(sv, "\\s+", 3); + auto expected0 = cudf::strings::rsplit_record_re(sv, "\\s+"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); + } +} + +TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto split_record_result = cudf::strings::split_record(zero_size_strings_column); - EXPECT_TRUE(split_record_result->size() == 0); - auto rsplit_record_result = cudf::strings::rsplit_record(zero_size_strings_column); - EXPECT_TRUE(rsplit_record_result->size() == 0); + auto results = cudf::strings::split(zero_size_strings_column); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::rsplit(zero_size_strings_column); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::split_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(results->num_columns() == 1); + 
EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::rsplit_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + + auto list_result = cudf::strings::split_record(zero_size_strings_column); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::rsplit_record(zero_size_strings_column); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::split_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(list_result->size() == 0); +} + +// This test specifically for https://github.com/rapidsai/custrings/issues/119 +TEST_F(StringsSplitTest, AllNullsCase) +{ + cudf::test::strings_column_wrapper input({"", "", ""}, {0, 0, 0}); + auto sv = cudf::strings_column_view(input); + + auto results = cudf::strings::split(sv); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::split(sv, cudf::string_scalar("-")); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::rsplit(sv); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::rsplit(sv, cudf::string_scalar("-")); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::split_re(sv, "-"); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::rsplit_re(sv, "-"); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + + auto list_result = cudf::strings::split_record(sv); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{}}, cudf::test::iterators::all_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::rsplit_record(sv); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::split_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::rsplit_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); } TEST_F(StringsSplitTest, Partition) @@ -612,13 +785,20 @@ TEST_F(StringsSplitTest, PartitionZeroSizeStringsColumns) TEST_F(StringsSplitTest, InvalidParameter) { - std::vector h_strings{"string left intentionally blank"}; - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); - auto strings_view = cudf::strings_column_view(strings); + cudf::test::strings_column_wrapper input({"string left intentionally blank"}); + auto strings_view = cudf::strings_column_view(input); EXPECT_THROW(cudf::strings::split(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rsplit(strings_view, cudf::string_scalar("", false)), cudf::logic_error); + EXPECT_THROW(cudf::strings::split_record(strings_view, cudf::string_scalar("", false)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_record(strings_view, cudf::string_scalar("", false)), + cudf::logic_error); + 
EXPECT_THROW(cudf::strings::split_re(strings_view, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_re(strings_view, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_record_re(strings_view, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rpartition(strings_view, cudf::string_scalar("", false)), diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst index 190978a7581..141e5adba93 100644 --- a/docs/cudf/source/api_docs/groupby.rst +++ b/docs/cudf/source/api_docs/groupby.rst @@ -61,6 +61,7 @@ Computations / descriptive stats GroupBy.sum GroupBy.var GroupBy.corr + GroupBy.cov The following methods are available in both ``SeriesGroupBy`` and ``DataFrameGroupBy`` objects, but may differ slightly, usually in that diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst index cbc8f7e712f..f74853769f6 100644 --- a/docs/cudf/source/basics/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -180,7 +180,8 @@ that support them: +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ | corr | ✅ | | | | | | | ✅ | +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - + | cov | ✅ | | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ GroupBy apply ------------- diff --git a/java/pom.xml b/java/pom.xml old mode 100755 new mode 100644 index 8f0fb1000d8..02828a21e67 --- a/java/pom.xml +++ b/java/pom.xml @@ -147,7 +147,7 @@ org.apache.hadoop hadoop-common - 3.1.0 + 3.1.4 test diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java index 1d73bd71246..d10329ca0f2 100644 --- a/java/src/main/java/ai/rapids/cudf/Aggregation.java +++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,8 +66,9 @@ enum Kind { MERGE_M2(27), RANK(28), DENSE_RANK(29), - TDIGEST(30), // This can take a delta argument for accuracy level - MERGE_TDIGEST(31); // This can take a delta argument for accuracy level + PERCENT_RANK(30), + TDIGEST(31), // This can take a delta argument for accuracy level + MERGE_TDIGEST(32); // This can take a delta argument for accuracy level final int nativeId; @@ -754,6 +755,19 @@ static DenseRankAggregation denseRank() { return new DenseRankAggregation(); } + static final class PercentRankAggregation extends NoParamAggregation { + private PercentRankAggregation() { + super(Kind.PERCENT_RANK); + } + } + + /** + * Get the row's percent ranking. + */ + static PercentRankAggregation percentRank() { + return new PercentRankAggregation(); + } + /** * Collect the values into a list. Nulls will be skipped. 
*/ diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 3ff2a370e4f..f91ee5535b1 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -826,18 +826,18 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co /** * Creates a deep copy of a column while replacing the validity mask. The validity mask is the * device_vector equivalent of the boolean column given as argument. - * + * * The boolColumn must have the same number of rows as the current column. - * The result column will have the same number of rows as the current column. + * The result column will have the same number of rows as the current column. * For all indices `i` where the boolColumn is `true`, the result column will have a valid value at index i. * For all other values (i.e. `false` or `null`), the result column will have nulls. - * + * * If the current column has a null at a given index `i`, and the new validity mask is `true` at index `i`, * then the row value is undefined. - * + * * @param boolColumn bool column whose value is to be used as the validity mask. * @return Deep copy of the column with replaced validity mask. - */ + */ public final ColumnVector copyWithBooleanColumnAsValidity(ColumnView boolColumn) { return new ColumnVector(copyWithBooleanColumnAsValidity(getNativeView(), boolColumn.getNativeView())); } @@ -2345,88 +2345,128 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { } /** - * Returns a list of columns by splitting each string using the specified delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return list of strings columns as a table. 
*/ - public final Table stringSplit(Scalar delimiter, int maxSplit) { + public final Table stringSplit(String pattern, int limit, boolean splitByRegex) { assert type.equals(DType.STRING) : "column type must be a String"; - assert delimiter != null : "delimiter may not be null"; - assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + assert pattern != null : "pattern is null"; + assert pattern.length() > 0 : "empty pattern is not supported"; + assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported"; + return new Table(stringSplit(this.getNativeView(), pattern, limit, splitByRegex)); } - + /** - * Returns a list of columns by splitting each string using the specified delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return list of strings columns as a table. */ - public final Table stringSplit(Scalar delimiter) { - return stringSplit(delimiter, -1); + public final Table stringSplit(String pattern, boolean splitByRegex) { + return stringSplit(pattern, -1, splitByRegex); } /** - * Returns a list of columns by splitting each string using whitespace as the delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified string literal + * delimiter. The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. Null input entries + * result in all nulls in the corresponding rows of the output columns. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @return list of strings columns as a table. 
*/ - public final Table stringSplit() { - try (Scalar emptyString = Scalar.fromString("")) { - return stringSplit(emptyString, -1); - } + public final Table stringSplit(String delimiter, int limit) { + return stringSplit(delimiter, limit, false); } /** - * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. + * Returns a list of columns by splitting each string using the specified string literal + * delimiter. The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. Null input entries + * result in all nulls in the corresponding rows of the output columns. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @return list of strings columns as a table. */ - public final ColumnVector stringSplitRecord() { - return stringSplitRecord(-1); + public final Table stringSplit(String delimiter) { + return stringSplit(delimiter, -1, false); } /** - * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * Returns a column of lists of strings in which each list is made by splitting the + * corresponding input string using the specified pattern. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return a LIST column of string elements. */ - public final ColumnVector stringSplitRecord(int maxSplit) { - try (Scalar emptyString = Scalar.fromString("")) { - return stringSplitRecord(emptyString, maxSplit); - } + public final ColumnVector stringSplitRecord(String pattern, int limit, boolean splitByRegex) { + assert type.equals(DType.STRING) : "column type must be String"; + assert pattern != null : "pattern is null"; + assert pattern.length() > 0 : "empty pattern is not supported"; + assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported"; + return new ColumnVector( + stringSplitRecord(this.getNativeView(), pattern, limit, splitByRegex)); + } + + /** + * Returns a column of lists of strings in which each list is made by splitting the + * corresponding input string using the specified pattern. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return a LIST column of string elements. + */ + public final ColumnVector stringSplitRecord(String pattern, boolean splitByRegex) { + return stringSplitRecord(pattern, -1, splitByRegex); } /** - * Returns a column of lists of strings by splitting each string using the specified delimiter. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace.
+ * Returns a column of lists of strings in which each list is made by splitting the + * corresponding input string using the specified string literal delimiter. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @return a LIST column of string elements. */ - public final ColumnVector stringSplitRecord(Scalar delimiter) { - return stringSplitRecord(delimiter, -1); + public final ColumnVector stringSplitRecord(String delimiter, int limit) { + return stringSplitRecord(delimiter, limit, false); } /** - * Returns a column that is a list of strings. Each string list is made by splitting each input - * string using the specified delimiter. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. - * @return New table of strings columns. + * Returns a column of lists of strings in which each list is made by splitting the + * corresponding input string using the specified string literal delimiter. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @return a LIST column of string elements. */ - public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) { - assert type.equals(DType.STRING) : "column type must be a String"; - assert delimiter != null : "delimiter may not be null"; - assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new ColumnVector( - stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + public final ColumnVector stringSplitRecord(String delimiter) { + return stringSplitRecord(delimiter, -1, false); } /** @@ -3248,7 +3288,7 @@ public enum FindOptions {FIND_FIRST, FIND_LAST}; * Create a column of int32 indices, indicating the position of the scalar search key * in each list row. * All indices are 0-based. If a search key is not found, the index is set to -1. - * The index is set to null if one of the following is true: + * The index is set to null if one of the following is true: * 1. The search key is null. * 2. The list row is null. * @param key The scalar search key @@ -3265,7 +3305,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) { * Create a column of int32 indices, indicating the position of each row in the * search key column in the corresponding row of the lists column. * All indices are 0-based. If a search key is not found, the index is set to -1. - * The index is set to null if one of the following is true: + * The index is set to null if one of the following is true: * 1. The search key row is null. * 2. The list row is null. * @param keys ColumnView of search keys. @@ -3531,15 +3571,36 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle private static native long substringLocate(long columnView, long substringScalar, int start, int end); /** - * Native method which returns array of columns by splitting each string using the specified - * delimiter. - * @param columnView native handle of the cudf::column_view being operated on.
- * @param delimiter UTF-8 encoded string identifying the split points in each string. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param nativeHandle native handle of the input strings column that is being operated on. + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. */ - private static native long[] stringSplit(long columnView, long delimiter, int maxSplit); + private static native long[] stringSplit(long nativeHandle, String pattern, int limit, + boolean splitByRegex); - private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit); + /** + * Returns a column of lists of strings in which each list is made by splitting the + * corresponding input string using the specified pattern. + * + * @param nativeHandle native handle of the input strings column that is being operated on. + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + */ + private static native long stringSplitRecord(long nativeHandle, String pattern, int limit, + boolean splitByRegex); /** * Native method to calculate substring from a given string column. 0 indexing. @@ -3714,7 +3775,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat /** * Native method to search list rows for null elements. * @param nativeView the column view handle of the list - * @return column handle of the resultant boolean column + * @return column handle of the resultant boolean column */ private static native long listContainsNulls(long nativeView); @@ -3896,20 +3957,20 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi /** * Native method to deep copy a column while replacing the null mask. The null mask is the * device_vector equivalent of the boolean column given as argument. - * + * * The boolColumn must have the same number of rows as the exemplar column. * The result column will have the same number of rows as the exemplar. * For all indices `i` where the boolean column is `true`, the result column will have a valid value at index i. * For all other values (i.e. `false` or `null`), the result column will have nulls.
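 * For illustration (hypothetical values): an exemplar column [1, null, 3] combined with a
 * bool column [true, true, false] yields [1, undefined, null]; row 1 is marked valid but its
 * payload is unspecified, as described below.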
- * + * If the exemplar column has a null at a given index `i`, and the new validity mask is `true` at index `i`, * then the resultant row value is undefined. - * + * * @param exemplarViewHandle column view of the column that is deep copied. * @param boolColumnViewHandle bool column whose value is to be used as the null mask. * @return Deep copy of the column with replaced null mask. - */ - private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, + */ + private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, long boolColumnViewHandle) throws CudfException; //////// diff --git a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java index 0e49636fae6..78b3d5d52ec 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,9 @@ private ColumnWriterOptions(AbstractStructBuilder builder) { (ColumnWriterOptions[]) builder.children.toArray(new ColumnWriterOptions[0]); } + // The sentinel value for unknown precision (the default) + public static final int UNKNOWN_PRECISION = -1; + /** * Constructor used for list */ @@ -103,7 +106,7 @@ protected ColumnWriterOptions withDecimal(String name, int precision, protected ColumnWriterOptions withTimestamp(String name, boolean isInt96, boolean isNullable) { - return new ColumnWriterOptions(name, isInt96, 0, isNullable); + return new ColumnWriterOptions(name, isInt96, UNKNOWN_PRECISION, isNullable); } /** @@ -243,7 +246,7 @@ public ColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, public ColumnWriterOptions(String columnName, boolean isNullable) { this.isTimestampTypeInt96 = false; - this.precision = 0; + this.precision = UNKNOWN_PRECISION; this.isNullable = isNullable; this.columnName = columnName; } diff --git a/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java index 219b6dde05d..006941deec6 100644 --- a/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -115,4 +115,11 @@ public static GroupByScanAggregation rank() { public static GroupByScanAggregation denseRank() { return new GroupByScanAggregation(Aggregation.denseRank()); } + + /** + * Get the row's percent ranking. + */ + public static GroupByScanAggregation percentRank() { + return new GroupByScanAggregation(Aggregation.percentRank()); + } } diff --git a/java/src/main/java/ai/rapids/cudf/ScanAggregation.java b/java/src/main/java/ai/rapids/cudf/ScanAggregation.java index 08489562adc..752fd57355b 100644 --- a/java/src/main/java/ai/rapids/cudf/ScanAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/ScanAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION.
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,4 +97,11 @@ public static ScanAggregation rank() { public static ScanAggregation denseRank() { return new ScanAggregation(Aggregation.denseRank()); } + + /** + * Get the row's percent rank. + */ + public static ScanAggregation percentRank() { + return new ScanAggregation(Aggregation.percentRank()); + } } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index bb0321d0a16..17e10933b65 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -539,14 +539,11 @@ private static native long[] leftJoin(long leftTable, int[] leftJoinCols, long r private static native long[] leftJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; - private static native long leftJoinRowCount(long leftTable, long rightHashJoin, - boolean nullsEqual) throws CudfException; + private static native long leftJoinRowCount(long leftTable, long rightHashJoin) throws CudfException; - private static native long[] leftHashJoinGatherMaps(long leftTable, long rightHashJoin, - boolean nullsEqual) throws CudfException; + private static native long[] leftHashJoinGatherMaps(long leftTable, long rightHashJoin) throws CudfException; private static native long[] leftHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin, - boolean nullsEqual, long outputRowCount) throws CudfException; private static native long[] innerJoin(long leftTable, int[] leftJoinCols, long rightTable, @@ -555,14 +552,11 @@ private static native long[] innerJoin(long leftTable, int[] leftJoinCols, long private static native long[] innerJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; - private static native long innerJoinRowCount(long table, long hashJoin, - boolean nullsEqual) throws CudfException; + private static native long innerJoinRowCount(long table, long hashJoin) throws CudfException; - private static native long[] innerHashJoinGatherMaps(long table, long hashJoin, - boolean nullsEqual) throws CudfException; + private static native long[] innerHashJoinGatherMaps(long table, long hashJoin) throws CudfException; private static native long[] innerHashJoinGatherMapsWithCount(long table, long hashJoin, - boolean nullsEqual, long outputRowCount) throws CudfException; private static native long[] fullJoin(long leftTable, int[] leftJoinCols, long rightTable, @@ -571,14 +565,11 @@ private static native long[] fullJoin(long leftTable, int[] leftJoinCols, long r private static native long[] fullJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; - private static native long fullJoinRowCount(long leftTable, long rightHashJoin, - boolean nullsEqual) throws CudfException; + private static native long fullJoinRowCount(long leftTable, long rightHashJoin) throws CudfException; - private static native long[] fullHashJoinGatherMaps(long leftTable, long rightHashJoin, - boolean nullsEqual) throws CudfException; + private static native long[] fullHashJoinGatherMaps(long leftTable, long rightHashJoin) throws CudfException; private static native long[] fullHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin, - boolean nullsEqual, long outputRowCount) throws CudfException; private static native long[] leftSemiJoin(long leftTable, int[] leftJoinCols, long rightTable, @@ -2318,8 
+2309,7 @@ public long leftJoinRowCount(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - return leftJoinRowCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + return leftJoinRowCount(getNativeView(), rightHash.getNativeView()); } /** @@ -2337,9 +2327,7 @@ public GatherMap[] leftJoinGatherMaps(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - leftHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + long[] gatherMapData = leftHashJoinGatherMaps(getNativeView(), rightHash.getNativeView()); return buildJoinGatherMaps(gatherMapData); } @@ -2363,9 +2351,8 @@ public GatherMap[] leftJoinGatherMaps(HashJoin rightHash, long outputRowCount) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - leftHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls(), outputRowCount); + long[] gatherMapData = leftHashJoinGatherMapsWithCount(getNativeView(), + rightHash.getNativeView(), outputRowCount); return buildJoinGatherMaps(gatherMapData); } @@ -2545,8 +2532,7 @@ public long innerJoinRowCount(HashJoin otherHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "otherKeys: " + otherHash.getNumberOfColumns()); } - return innerJoinRowCount(getNativeView(), otherHash.getNativeView(), - otherHash.getCompareNulls()); + return innerJoinRowCount(getNativeView(), otherHash.getNativeView()); } /** @@ -2564,9 +2550,7 @@ public GatherMap[] innerJoinGatherMaps(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - innerHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + long[] gatherMapData = innerHashJoinGatherMaps(getNativeView(), rightHash.getNativeView()); return buildJoinGatherMaps(gatherMapData); } @@ -2590,9 +2574,8 @@ public GatherMap[] innerJoinGatherMaps(HashJoin rightHash, long outputRowCount) throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - innerHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls(), outputRowCount); + long[] gatherMapData = innerHashJoinGatherMapsWithCount(getNativeView(), + rightHash.getNativeView(), outputRowCount); return buildJoinGatherMaps(gatherMapData); } @@ -2778,8 +2761,7 @@ public long fullJoinRowCount(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - return fullJoinRowCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + return fullJoinRowCount(getNativeView(), rightHash.getNativeView()); } /** @@ -2797,9 +2779,7 @@ public GatherMap[] fullJoinGatherMaps(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - 
fullHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + long[] gatherMapData = fullHashJoinGatherMaps(getNativeView(), rightHash.getNativeView()); return buildJoinGatherMaps(gatherMapData); } @@ -2823,9 +2803,8 @@ public GatherMap[] fullJoinGatherMaps(HashJoin rightHash, long outputRowCount) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - fullHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls(), outputRowCount); + long[] gatherMapData = fullHashJoinGatherMapsWithCount(getNativeView(), + rightHash.getNativeView(), outputRowCount); return buildJoinGatherMaps(gatherMapData); } diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp index 93a01854ced..f8c448566c8 100644 --- a/java/src/main/native/src/AggregationJni.cpp +++ b/java/src/main/native/src/AggregationJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -85,6 +85,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv return cudf::make_rank_aggregation(); case 29: // DENSE_RANK return cudf::make_dense_rank_aggregation(); + case 30: // PERCENT_RANK + return cudf::make_percent_rank_aggregation(); default: throw std::logic_error("Unsupported No Parameter Aggregation Operation"); } }(); @@ -139,10 +141,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createTDigestAgg(JNIEnv std::unique_ptr ret; // These numbers come from Aggregation.java and must stay in sync switch (kind) { - case 30: // TDIGEST + case 31: // TDIGEST ret = cudf::make_tdigest_aggregation(delta); break; - case 31: // MERGE_TDIGEST + case 32: // MERGE_TDIGEST ret = cudf::make_merge_tdigest_aggregation(delta); break; default: throw std::logic_error("Unsupported TDigest Aggregation Operation"); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index eec4a78a457..548844aa0d3 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -561,34 +562,78 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, - jlong column_view, - jlong delimiter_ptr, - jint max_split) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter_ptr, "string scalar delimiter is null", 0); + jlong input_handle, + jstring pattern_obj, + jint limit, + jboolean split_by_regex) { + JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); + + if (limit == 0 || limit == 1) { + // Cannot achieve the results of splitting with limit == 0 or limit == 1. + // This is because cudf operates on a different parameter (`max_split`) which is converted from + // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an + // unlimited split. 
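+ // For illustration: limit = 2 maps to max_split = 1 (at most one split, so at
+ // most two output parts per row), while limit = -1 maps to max_split = -1
+ // (unlimited splits).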
+ JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "limit == 0 and limit == 1 are not supported", 0); + } + try { cudf::jni::auto_set_device(env); - cudf::strings_column_view const scv{*reinterpret_cast(column_view)}; - auto delimiter = reinterpret_cast(delimiter_ptr); + auto const input = reinterpret_cast(input_handle); + auto const strs_input = cudf::strings_column_view{*input}; - return cudf::jni::convert_table_for_return(env, - cudf::strings::split(scv, *delimiter, max_split)); + auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj); + if (pattern_jstr.is_empty()) { + // Java's split API produces different behaviors than cudf when splitting with empty + // pattern. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); + } + + auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes()); + auto const max_split = limit > 1 ? limit - 1 : limit; + auto result = split_by_regex ? + cudf::strings::split_re(strs_input, pattern, max_split) : + cudf::strings::split(strs_input, cudf::string_scalar{pattern}, max_split); + return cudf::jni::convert_table_for_return(env, std::move(result)); } CATCH_STD(env, 0); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass, - jlong column_view, - jlong delimiter, - jint max_split) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0); + jlong input_handle, + jstring pattern_obj, + jint limit, + jboolean split_by_regex) { + JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); + + if (limit == 0 || limit == 1) { + // Cannot achieve the results of splitting with limit == 0 or limit == 1. + // This is because cudf operates on a different parameter (`max_split`) which is converted from + // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an + // unlimited split. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "limit == 0 and limit == 1 are not supported", 0); + } + try { cudf::jni::auto_set_device(env); - cudf::column_view *cv = reinterpret_cast(column_view); - cudf::strings_column_view scv(*cv); - cudf::string_scalar *ss_scalar = reinterpret_cast(delimiter); - return release_as_jlong(cudf::strings::split_record(scv, *ss_scalar, max_split)); + auto const input = reinterpret_cast(input_handle); + auto const strs_input = cudf::strings_column_view{*input}; + + auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj); + if (pattern_jstr.is_empty()) { + // Java's split API produces different behaviors than cudf when splitting with empty + // pattern. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); + } + + auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes()); + auto const max_split = limit > 1 ? limit - 1 : limit; + auto result = + split_by_regex ? 
+ cudf::strings::split_record_re(strs_input, pattern, max_split) : + cudf::strings::split_record(strs_input, cudf::string_scalar{pattern}, max_split); + return release_as_jlong(result); } CATCH_STD(env, 0); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index aeac1856db0..1cf56da35da 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -676,9 +676,10 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, int write_index = 0; for (int i = 0; i < num_children; i++, write_index++) { cudf::io::column_in_metadata child; - child.set_name(col_names[read_index]) - .set_decimal_precision(precisions[read_index]) - .set_nullability(nullability[read_index]); + child.set_name(col_names[read_index]).set_nullability(nullability[read_index]); + if (precisions[read_index] > -1) { + child.set_decimal_precision(precisions[read_index]); + } if (!is_int96.is_null()) { child.set_int96_timestamps(is_int96[read_index]); } @@ -717,8 +718,10 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam for (int i = read_index, write_index = 0; i < top_level_children; i++, write_index++) { metadata.column_metadata[write_index] .set_name(cpp_names[read_index]) - .set_nullability(col_nullability[read_index]) - .set_decimal_precision(precisions[read_index]); + .set_nullability(col_nullability[read_index]); + if (precisions[read_index] > -1) { + metadata.column_metadata[write_index].set_decimal_precision(precisions[read_index]); + } if (!is_int96.is_null()) { metadata.column_metadata[write_index].set_int96_timestamps(is_int96[read_index]); } @@ -812,15 +815,14 @@ jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys, // a hash table built from the join's right table. template jlongArray hash_join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_hash_join, - jboolean compare_nulls_equal, T join_func) { + T join_func) { JNI_NULL_CHECK(env, j_left_keys, "left table is null", NULL); JNI_NULL_CHECK(env, j_right_hash_join, "hash join is null", NULL); try { cudf::jni::auto_set_device(env); auto left_keys = reinterpret_cast(j_left_keys); auto hash_join = reinterpret_cast(j_right_hash_join); - auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - return gather_maps_to_java(env, join_func(*left_keys, *hash_join, nulleq)); + return gather_maps_to_java(env, join_func(*left_keys, *hash_join)); } CATCH_STD(env, NULL); } @@ -2172,41 +2174,36 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoinGatherMaps( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_leftJoinRowCount(JNIEnv *env, jclass, jlong j_left_table, - jlong j_right_hash_join, - jboolean compare_nulls_equal) { + jlong j_right_hash_join) { JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); try { cudf::jni::auto_set_device(env); auto left_table = reinterpret_cast(j_left_table); auto hash_join = reinterpret_cast(j_right_hash_join); - auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - auto row_count = hash_join->left_join_size(*left_table, nulleq); + auto row_count = hash_join->left_join_size(*left_table); return static_cast(row_count); } CATCH_STD(env, 0); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMaps( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, - jboolean compare_nulls_equal) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join) { return cudf::jni::hash_join_gather_maps( - env, j_left_table, j_right_hash_join, compare_nulls_equal, - [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { - return hash.left_join(left, nulleq); + env, j_left_table, j_right_hash_join, + [](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.left_join(left); }); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMapsWithCount( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, - jlong j_output_row_count) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count) { auto output_row_count = static_cast(j_output_row_count); - return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, - [output_row_count](cudf::table_view const &left, - cudf::hash_join const &hash, - cudf::null_equality nulleq) { - return hash.left_join(left, nulleq, output_row_count); - }); + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, + [output_row_count](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.left_join(left, output_row_count); + }); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinRowCount(JNIEnv *env, jclass, @@ -2305,41 +2302,36 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoinGatherMaps( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_innerJoinRowCount(JNIEnv *env, jclass, jlong j_left_table, - jlong j_right_hash_join, - jboolean compare_nulls_equal) { + jlong j_right_hash_join) { JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); try { cudf::jni::auto_set_device(env); auto left_table = reinterpret_cast(j_left_table); auto hash_join = reinterpret_cast(j_right_hash_join); - auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - auto row_count = hash_join->inner_join_size(*left_table, nulleq); + auto row_count = hash_join->inner_join_size(*left_table); return static_cast(row_count); } CATCH_STD(env, 0); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMaps( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, - jboolean compare_nulls_equal) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join) { return cudf::jni::hash_join_gather_maps( - env, j_left_table, j_right_hash_join, compare_nulls_equal, - [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { - return hash.inner_join(left, nulleq); + env, j_left_table, j_right_hash_join, + [](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.inner_join(left); }); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMapsWithCount( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, - jlong j_output_row_count) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count) { auto output_row_count = static_cast(j_output_row_count); - return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, - [output_row_count](cudf::table_view const &left, - cudf::hash_join const &hash, - cudf::null_equality nulleq) { - return hash.inner_join(left, nulleq, output_row_count); - }); + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, + [output_row_count](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.inner_join(left, output_row_count); + }); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinRowCount(JNIEnv *env, jclass, @@ -2438,41 +2430,36 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoinGatherMaps( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_fullJoinRowCount(JNIEnv *env, jclass, jlong j_left_table, - jlong j_right_hash_join, - jboolean compare_nulls_equal) { + jlong j_right_hash_join) { JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); try { cudf::jni::auto_set_device(env); auto left_table = reinterpret_cast(j_left_table); auto hash_join = reinterpret_cast(j_right_hash_join); - auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - auto row_count = hash_join->full_join_size(*left_table, nulleq); + auto row_count = hash_join->full_join_size(*left_table); return static_cast(row_count); } CATCH_STD(env, 0); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMaps( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, - jboolean compare_nulls_equal) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join) { return cudf::jni::hash_join_gather_maps( - env, j_left_table, j_right_hash_join, compare_nulls_equal, - [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { - return hash.full_join(left, nulleq); + env, j_left_table, j_right_hash_join, + [](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.full_join(left); }); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMapsWithCount( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, - jlong j_output_row_count) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count) { auto output_row_count = static_cast(j_output_row_count); - return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, - [output_row_count](cudf::table_view const &left, - cudf::hash_join const &hash, - cudf::null_equality nulleq) { - return hash.full_join(left, nulleq, output_row_count); - }); + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, + [output_row_count](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.full_join(left, output_row_count); + }); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalFullJoinGatherMaps( diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 4a5265b1d2e..5a2aa44261d 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -187,8 +189,82 @@ struct batch_data { std::vector row_batches; // information about each batch such as byte count }; -struct row_offset_functor { - row_offset_functor(size_type fixed_width_only_row_size) +/** + * @brief builds row size information for tables that contain strings + * + * @param tbl table from which to compute row size information + * @param fixed_width_and_validity_size size of fixed-width and validity data in this table + * @param stream cuda stream on which to operate + * @return device vector of size_types of the row sizes of the table + */ +rmm::device_uvector build_string_row_sizes(table_view const &tbl, + size_type fixed_width_and_validity_size, + rmm::cuda_stream_view stream) { + auto const num_rows = tbl.num_rows(); + rmm::device_uvector d_row_sizes(num_rows, stream); + thrust::uninitialized_fill(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), 0); + + auto d_offsets_iterators = [&]() { + std::vector offsets_iterators; + auto offsets_iter = thrust::make_transform_iterator( + tbl.begin(), [](auto const &col) -> strings_column_view::offset_iterator { + if (!is_fixed_width(col.type())) { + CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); + return strings_column_view(col).offsets_begin(); + } else { + return nullptr; + } + }); + 
std::copy_if(offsets_iter, offsets_iter + tbl.num_columns(), + std::back_inserter(offsets_iterators), + [](auto const &offset_ptr) { return offset_ptr != nullptr; }); + return make_device_uvector_async(offsets_iterators, stream); + }(); + + auto const num_columns = static_cast(d_offsets_iterators.size()); + + thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_columns * num_rows), + [d_offsets_iterators = d_offsets_iterators.data(), num_columns, num_rows, + d_row_sizes = d_row_sizes.data()] __device__(auto element_idx) { + auto const row = element_idx % num_rows; + auto const col = element_idx / num_rows; + auto const val = + d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; + atomicAdd(&d_row_sizes[row], val); + }); + + // transform the row sizes to include fixed width size and alignment + thrust::transform(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), + d_row_sizes.begin(), [fixed_width_and_validity_size] __device__(auto row_size) { + return util::round_up_unsafe(fixed_width_and_validity_size + row_size, + JCUDF_ROW_ALIGNMENT); + }); + + return d_row_sizes; +} + +/** + * @brief functor to return the offset of a row in a table with string columns + * + */ +struct string_row_offset_functor { + string_row_offset_functor(device_span _d_row_offsets) + : d_row_offsets(_d_row_offsets){}; + + __device__ inline size_type operator()(int row_number, int) const { + return d_row_offsets[row_number]; + } + + device_span d_row_offsets; +}; + +/** + * @brief functor to return the offset of a row in a table with only fixed-width columns + * + */ +struct fixed_width_row_offset_functor { + fixed_width_row_offset_functor(size_type fixed_width_only_row_size) : _fixed_width_only_row_size(fixed_width_only_row_size){}; __device__ inline size_type operator()(int row_number, int tile_row_start) const { @@ -542,6 +618,10 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; auto const absolute_col = relative_col + fetch_tile.start_col; + if (input_data[absolute_col] == nullptr) { + // variable-width data + continue; + } auto const absolute_row = relative_row + fetch_tile.start_row; auto const col_size = col_sizes[absolute_col]; auto const col_offset = col_offsets[absolute_col]; @@ -1194,10 +1274,8 @@ static size_type compute_column_information(iterator begin, iterator end, auto validity_offset = fixed_width_size_per_row; column_starts.push_back(validity_offset); - return util::round_up_unsafe( - fixed_width_size_per_row + - util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT), - JCUDF_ROW_ALIGNMENT); + return fixed_width_size_per_row + + util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT); } /** @@ -1512,20 +1590,27 @@ void determine_tiles(std::vector const &column_sizes, } } -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -} // namespace detail - -std::vector> convert_to_rows(table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - auto const num_columns = tbl.num_columns(); - auto const num_rows = tbl.num_rows(); - - auto const fixed_width_only = std::all_of( - tbl.begin(), tbl.end(), [](column_view const &c) { return is_fixed_width(c.type()); }); - +/** + * @brief convert cudf table into JCUDF row format + * + * @tparam offsetFunctor 
functor type for offset functor + * @param tbl table to convert to JCUDF row format + * @param batch_info information about the batches of data + * @param offset_functor functor that returns the starting offset of each row + * @param column_starts starting offset of a column in a row + * @param column_sizes size of each element in a column + * @param fixed_width_size_per_row size of fixed-width data in a row of this table + * @param stream stream used + * @param mr selected memory resource for returned data + * @return vector of list columns containing byte columns of the JCUDF row data + */ +template +std::vector> +convert_to_rows(table_view const &tbl, batch_data &batch_info, offsetFunctor offset_functor, + std::vector const &column_starts, + std::vector const &column_sizes, + size_type const fixed_width_size_per_row, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem_in_bytes; @@ -1537,23 +1622,12 @@ std::vector> convert_to_rows(table_view const &tbl, sizeof(cuda::barrier) * NUM_TILES_PER_KERNEL_LOADED; auto const shmem_limit_per_tile = total_shmem_in_bytes / NUM_TILES_PER_KERNEL_LOADED; - // break up the work into tiles, which are a starting and ending row/col #. - // this tile size is calculated based on the shared memory size available - // we want a single tile to fill up the entire shared memory space available - // for the transpose-like conversion. - - // There are two different processes going on here. The GPU conversion of the data - // and the writing of the data into the list of byte columns that are a maximum of - // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand - // this limitation because the column must own the data inside and as a result it must be - // a distinct allocation for that column. Copying the data into these final buffers would - // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. - // The tiles are broken at the boundaries of specific rows based on the row sizes up - // to that point. These are row batches and they are decided first before building the - // tiles so the tiles can be properly cut around them. 
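+ // For reference: the "2 gigs" cap described above comes from cudf's 32-bit list
+ // offsets, so each returned byte column must stay below 2^31 bytes and the row
+ // batches are cut to respect that bound.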
+ auto const num_rows = tbl.num_rows(); + auto const num_columns = tbl.num_columns(); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); // Get the pointers to the input columnar data ready - auto data_begin = thrust::make_transform_iterator( tbl.begin(), [](auto const &c) { return c.template data(); }); std::vector input_data(data_begin, data_begin + tbl.num_columns()); @@ -1565,27 +1639,6 @@ std::vector> convert_to_rows(table_view const &tbl, auto dev_input_data = make_device_uvector_async(input_data, stream, mr); auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto schema_column_iter = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), - [&tbl](auto i) -> std::tuple { - return {tbl.column(i).type(), tbl.column(i)}; - }); - - auto const fixed_width_size_per_row = detail::compute_column_information( - schema_column_iter, schema_column_iter + num_columns, column_starts, column_sizes); - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - // total encoded row size. This includes fixed-width data, validity, and variable-width data. - auto row_size_iter = thrust::make_constant_iterator(fixed_width_size_per_row); - auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); - // the first batch always exists unless we were sent an empty table auto const first_batch_size = batch_info.row_batches[0].row_count; @@ -1636,8 +1689,6 @@ std::vector> convert_to_rows(table_view const &tbl, util::div_rounding_up_unsafe(validity_tile_infos.size(), NUM_VALIDITY_TILES_PER_KERNEL)); dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); - detail::row_offset_functor offset_functor(fixed_width_size_per_row); - detail::copy_to_rows<<>>( num_rows, num_columns, shmem_limit_per_tile, gpu_tile_infos, dev_input_data.data(), dev_col_sizes.data(), dev_col_starts.data(), offset_functor, @@ -1670,6 +1721,76 @@ std::vector> convert_to_rows(table_view const &tbl, }); return ret; +} +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +} // namespace detail + +std::vector> convert_to_rows(table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + auto const num_columns = tbl.num_columns(); + auto const num_rows = tbl.num_rows(); + + auto const fixed_width_only = std::all_of( + tbl.begin(), tbl.end(), [](column_view const &c) { return is_fixed_width(c.type()); }); + + // break up the work into tiles, which are a starting and ending row/col #. + // this tile size is calculated based on the shared memory size available + // we want a single tile to fill up the entire shared memory space available + // for the transpose-like conversion. + + // There are two different processes going on here. The GPU conversion of the data + // and the writing of the data into the list of byte columns that are a maximum of + // 2 gigs each due to offset maximum size. 
The GPU conversion portion has to understand + // this limitation because the column must own the data inside and as a result it must be + // a distinct allocation for that column. Copying the data into these final buffers would + // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. + // The tiles are broken at the boundaries of specific rows based on the row sizes up + // to that point. These are row batches and they are decided first before building the + // tiles so the tiles can be properly cut around them. + + std::vector column_sizes; // byte size of each column + std::vector column_starts; // offset of column inside a row including alignment + column_sizes.reserve(num_columns); + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start + + auto schema_column_iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [&tbl](auto i) -> std::tuple { + return {tbl.column(i).type(), tbl.column(i)}; + }); + + auto const fixed_width_size_per_row = detail::compute_column_information( + schema_column_iter, schema_column_iter + num_columns, column_starts, column_sizes); + if (fixed_width_only) { + // total encoded row size. This includes fixed-width data and validity only. It does not include + // variable-width data since it isn't copied with the fixed-width and validity kernel. + auto row_size_iter = thrust::make_constant_iterator( + util::round_up_unsafe(fixed_width_size_per_row, JCUDF_ROW_ALIGNMENT)); + + auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); + + detail::fixed_width_row_offset_functor offset_functor( + util::round_up_unsafe(fixed_width_size_per_row, JCUDF_ROW_ALIGNMENT)); + + return detail::convert_to_rows(tbl, batch_info, offset_functor, column_starts, column_sizes, + fixed_width_size_per_row, stream, mr); + } else { + auto row_sizes = detail::build_string_row_sizes(tbl, fixed_width_size_per_row, stream); + + auto row_size_iter = cudf::detail::make_counting_transform_iterator( + 0, detail::row_size_functor(num_rows, row_sizes.data(), 0)); + + auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); + + detail::string_row_offset_functor offset_functor(batch_info.batch_row_offsets); + + return detail::convert_to_rows(tbl, batch_info, offset_functor, column_starts, column_sizes, + fixed_width_size_per_row, stream, mr); + } + #else CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); return {}; @@ -1768,8 +1889,9 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { return std::make_tuple(schema[i], nullptr); }); - auto const fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); + auto const fixed_width_size_per_row = util::round_up_unsafe( + detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes), + JCUDF_ROW_ALIGNMENT); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine @@ -1842,7 +1964,7 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); - detail::row_offset_functor offset_functor(fixed_width_size_per_row); + detail::fixed_width_row_offset_functor offset_functor(fixed_width_size_per_row); detail::copy_from_rows<<>>( num_rows, num_columns, shmem_limit_per_tile, offset_functor, gpu_batch_row_boundaries.data(), diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index f9c8029ed84..b759c746735 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3099,6 +3099,28 @@ void testScanDenseRank() { } } + @Test + void testScanPercentRank() { + try (ColumnVector col1 = ColumnVector.fromBoxedInts(-97, -97, -97, null, -16, 5, null, null, 6, 6, 34, null); + ColumnVector col2 = ColumnVector.fromBoxedInts( 3, 3, 4, 7, 7, 7, 7, 7, 8, 8, 8, 9); + ColumnVector struct_order = ColumnVector.makeStruct(col1, col2); + ColumnVector expected = ColumnVector.fromBoxedDoubles( + 0.0, 0.0, 2.0/11, 3.0/11, 4.0/11, 5.0/11, 6.0/11, 6.0/11, 8.0/11, 8.0/11, 10.0/11, 1.0)) { + try (ColumnVector result = struct_order.scan(ScanAggregation.percentRank(), + ScanType.INCLUSIVE, NullPolicy.INCLUDE)) { + assertColumnsAreEqual(expected, result); + } + + // Exclude should have identical results + try (ColumnVector result = struct_order.scan(ScanAggregation.percentRank(), + ScanType.INCLUSIVE, NullPolicy.EXCLUDE)) { + assertColumnsAreEqual(expected, result); + } + + // Percent rank aggregations do not support ScanType.EXCLUSIVE + } + } + @Test void testWindowStatic() { try (Scalar one = Scalar.fromInt(1); @@ -4342,10 +4364,10 @@ void testExtractListElements() { ColumnVector expected = ColumnVector.fromStrings("Héllo", "thésé", null, - null, + "", "ARé", "test"); - ColumnVector tmp = v.stringSplitRecord(); + ColumnVector tmp = v.stringSplitRecord(" "); ColumnVector result = tmp.extractListElement(0)) { assertColumnsAreEqual(expected, result); } @@ -4739,28 +4761,12 @@ void testListSortRowsWithStringChild() { } } - @Test - void testStringSplitRecord() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings"); - ColumnVector expected = ColumnVector.fromLists( - new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("Héllo", "there"), - Arrays.asList("thésé"), - Arrays.asList("null"), - Arrays.asList(""), - Arrays.asList("ARé", "some"), - Arrays.asList("test", "strings")); - Scalar pattern = Scalar.fromString(" "); - ColumnVector result = v.stringSplitRecord(pattern, -1)) { - assertColumnsAreEqual(expected, result); - } - } - @Test void testStringSplit() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here"); - Table expectedSplitOnce = new Table.TestBuilder() + String pattern = " "; + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", + "ARé some things", "test strings here"); + Table expectedSplitLimit2 = new Table.TestBuilder() .column("Héllo", "thésé", null, "", "ARé", "test") .column("there all", null, null, null, "some things", "strings here") .build(); @@ -4769,41 +4775,92 @@ void testStringSplit() { .column("there", null, null, null, "some", "strings") .column("all", null, null, null, "things", "here") .build(); - Scalar pattern = Scalar.fromString(" "); - Table resultSplitOnce = 
v.stringSplit(pattern, 1); + Table resultSplitLimit2 = v.stringSplit(pattern, 2); Table resultSplitAll = v.stringSplit(pattern)) { - assertTablesAreEqual(expectedSplitOnce, resultSplitOnce); + assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2); assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } @Test - void teststringSplitWhiteSpace() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo thesé", null, "are\tsome", "tést\nString", " "); - Table expected = new Table.TestBuilder().column("Héllo", null, "are", "tést", null) - .column("thesé", null, "some", "String", null) - .build(); - Table result = v.stringSplit()) { - assertTablesAreEqual(expected, result); + void testStringSplitByRegularExpression() { + String pattern = "[_ ]"; + try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "", + "ARé some_things", "test_strings_here"); + Table expectedSplitLimit2 = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there all", null, null, null, "some_things", "strings_here") + .build(); + Table expectedSplitAll = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there", null, null, null, "some", "strings") + .column("all", null, null, null, "things", "here") + .build(); + Table resultSplitLimit2 = v.stringSplit(pattern, 2, true); + Table resultSplitAll = v.stringSplit(pattern, true)) { + assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } @Test - void teststringSplitThrowsException() { - assertThrows(CudfException.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Scalar delimiter = Scalar.fromString(null); - Table result = cv.stringSplit(delimiter)) {} - }); - assertThrows(AssertionError.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Scalar delimiter = Scalar.fromInt(1); - Table result = cv.stringSplit(delimiter)) {} - }); - assertThrows(AssertionError.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Table result = cv.stringSplit(null)) {} - }); + void testStringSplitRecord() { + String pattern = " "; + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", + "ARé some things", "test strings here"); + ColumnVector expectedSplitLimit2 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some things"), + Arrays.asList("test", "strings here")); + ColumnVector expectedSplitAll = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there", "all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some", "things"), + Arrays.asList("test", "strings", "here")); + ColumnVector resultSplitLimit2 = v.stringSplitRecord(pattern, 2); + ColumnVector resultSplitAll = v.stringSplitRecord(pattern)) { + assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertColumnsAreEqual(expectedSplitAll, resultSplitAll); + } + } + + @Test + void testStringSplitRecordByRegularExpression() { + String pattern = "[_ ]"; + try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "", 
+ "ARé some_things", "test_strings_here"); + ColumnVector expectedSplitLimit2 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some_things"), + Arrays.asList("test", "strings_here")); + ColumnVector expectedSplitAll = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there", "all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some", "things"), + Arrays.asList("test", "strings", "here")); + ColumnVector resultSplitLimit2 = v.stringSplitRecord(pattern, 2, true); + ColumnVector resultSplitAll = v.stringSplitRecord(pattern, true)) { + assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertColumnsAreEqual(expectedSplitAll, resultSplitAll); + } } @Test diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index db1327c5471..f309b1ee703 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -3930,8 +3930,8 @@ void testValidityFill() { @Test void testGroupByScan() { try (Table t1 = new Table.TestBuilder() - .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") - .column( 0, 1, 3, 3, 5, 5, 5, 5, 5, 5, 5) + .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") // GBY Key#0 + .column( 0, 1, 3, 3, 5, 5, 5, 5, 5, 5, 5) // GBY Key#1 .column(12.0, 14.0, 13.0, 17.0, 17.0, 17.0, null, null, 11.0, null, 10.0) .column( -9, null, -5, 0, 4, 4, 8, 2, 2, 2, null) .build()) { @@ -3945,16 +3945,18 @@ void testGroupByScan() { GroupByScanAggregation.min().onColumn(2), GroupByScanAggregation.max().onColumn(2), GroupByScanAggregation.rank().onColumn(3), - GroupByScanAggregation.denseRank().onColumn(3)); + GroupByScanAggregation.denseRank().onColumn(3), + GroupByScanAggregation.percentRank().onColumn(3)); Table expected = new Table.TestBuilder() - .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") + .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") .column( 0, 1, 3, 3, 5, 5, 5, 5, 5, 5, 5) .column(12.0, 14.0, 13.0, 30.0, 17.0, 34.0, null, null, 11.0, null, 21.0) .column( 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 3) // odd why is this not 1 based? .column(12.0, 14.0, 13.0, 13.0, 17.0, 17.0, null, null, 11.0, null, 10.0) .column(12.0, 14.0, 13.0, 17.0, 17.0, 17.0, null, null, 11.0, null, 11.0) - .column(1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 4) - .column(1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2) + .column( 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 4) + .column( 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2) + .column( 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) .build()) { assertTablesAreEqual(expected, result); } diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 68f7101b6ee..35b806909ec 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from enum import Enum, IntEnum @@ -59,6 +59,7 @@ class AggregationKind(Enum): PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA CORRELATION = libcudf_aggregation.aggregation.Kind.CORRELATION + COVARIANCE = libcudf_aggregation.aggregation.Kind.COVARIANCE class CorrelationType(IntEnum): @@ -354,6 +355,7 @@ cdef class Aggregation: )) return agg + cdef class RollingAggregation: """A Cython wrapper for rolling window aggregations. @@ -742,6 +744,21 @@ cdef class GroupbyAggregation: )) return agg + @classmethod + def cov( + cls, + libcudf_types.size_type min_periods, + libcudf_types.size_type ddof=1 + ): + cdef GroupbyAggregation agg = cls() + + agg.c_obj = move( + libcudf_aggregation. + make_covariance_aggregation[groupby_aggregation]( + min_periods, ddof + )) + return agg + cdef class GroupbyScanAggregation: """A Cython wrapper for groupby scan aggregations. diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 3982b4fecbb..04eead4ee1b 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -40,6 +40,7 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' CORRELATION 'cudf::aggregation::CORRELATION' + COVARIANCE 'cudf::aggregation::COVARIANCE' Kind kind @@ -117,3 +118,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef unique_ptr[T] make_correlation_aggregation[T]( correlation_type type, size_type min_periods) except + + + cdef unique_ptr[T] make_covariance_aggregation[T]( + size_type min_periods, size_type ddof) except + diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index fd9992152a6..41d10b7b6da 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -1,18 +1,25 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector -cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil: + + ctypedef enum hash_id "cudf::hash_id": + HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY" + HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3" + HASH_MD5 "cudf::hash_id::HASH_MD5" + HASH_SERIAL_MURMUR3 "cudf::hash_id::HASH_SERIAL_MURMUR3" + HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3" + cdef unique_ptr[column] hash "cudf::hash" ( const table_view& input, - const libcudf_types.hash_id hash_function, + const hash_id hash_function, const uint32_t seed ) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/contains.pxd b/python/cudf/cudf/_lib/cpp/strings/contains.pxd index 8014a60617d..59131f6e1aa 100644 --- a/python/cudf/cudf/_lib/cpp/strings/contains.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/contains.pxd @@ -1,20 +1,13 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
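The COVARIANCE plumbing above forwards two knobs, min_periods and ddof, to make_covariance_aggregation. As a rough, illustrative sketch of the per-group quantity this kind of aggregation computes — pairwise covariance over rows where both members of the pair are valid, divided by (n - ddof), and null when too few observations survive — here in plain Python rather than the libcudf kernel:

# Illustrative only: the guard and divisor mirror the min_periods/ddof
# parameters passed through make_covariance_aggregation above.
def group_cov(xs, ys, min_periods=1, ddof=1):
    pairs = [(x, y) for x, y in zip(xs, ys) if x is not None and y is not None]
    n = len(pairs)
    if n < min_periods or n <= ddof:
        return None  # not enough valid pairs
    mean_x = sum(x for x, _ in pairs) / n
    mean_y = sum(y for _, y in pairs) / n
    return sum((x - mean_x) * (y - mean_y) for x, y in pairs) / (n - ddof)

print(group_cov([1.0, 2.0, 3.0], [2.0, 4.0, 6.0]))  # 2.0 with ddof=1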
from libcpp.memory cimport unique_ptr from libcpp.string cimport string from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.strings.regex_flags cimport regex_flags -cdef extern from "cudf/strings/regex/flags.hpp" \ - namespace "cudf::strings" nogil: - - ctypedef enum regex_flags: - DEFAULT 'cudf::strings::regex_flags::DEFAULT' - MULTILINE 'cudf::strings::regex_flags::MULTILINE' - DOTALL 'cudf::strings::regex_flags::DOTALL' - cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] contains_re( diff --git a/python/cudf/cudf/_lib/cpp/strings/extract.pxd b/python/cudf/cudf/_lib/cpp/strings/extract.pxd index 606369c8994..43a43594997 100644 --- a/python/cudf/cudf/_lib/cpp/strings/extract.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/extract.pxd @@ -1,10 +1,11 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.strings.contains cimport regex_flags from cudf._lib.cpp.table.table cimport table @@ -12,4 +13,5 @@ cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[table] extract( column_view source_strings, - string pattern) except + + string pattern, + regex_flags flags) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/regex_flags.pxd b/python/cudf/cudf/_lib/cpp/strings/regex_flags.pxd new file mode 100644 index 00000000000..2a5701fa6a3 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/strings/regex_flags.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +cdef extern from "cudf/strings/regex/flags.hpp" \ + namespace "cudf::strings" nogil: + + ctypedef enum regex_flags: + DEFAULT 'cudf::strings::regex_flags::DEFAULT' + MULTILINE 'cudf::strings::regex_flags::MULTILINE' + DOTALL 'cudf::strings::regex_flags::DOTALL' diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index 23727a20ec2..b1a257feedf 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -81,13 +81,6 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: DECIMAL64 "cudf::type_id::DECIMAL64" DECIMAL128 "cudf::type_id::DECIMAL128" - ctypedef enum hash_id "cudf::hash_id": - HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY" - HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3" - HASH_MD5 "cudf::hash_id::HASH_MD5" - HASH_SERIAL_MURMUR3 "cudf::hash_id::HASH_SERIAL_MURMUR3" - HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3" - cdef cppclass data_type: data_type() except + data_type(const data_type&) except + diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 314542c9549..49a924c9104 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
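One detail worth noting about the regex_flags enum that now lives in its own regex_flags.pxd: the values appear chosen to line up with Python's re module (MULTILINE is 8 and DOTALL is 16 in CPython), which is what lets the bindings forward a raw flags integer without a translation step. A quick sanity check of the Python-side values; that libcudf mirrors them is an assumption to verify against cudf/strings/regex/flags.hpp:

import re

# CPython's values; cudf::strings::regex_flags is expected to match them,
# allowing flags integers to pass straight through the Cython layer.
assert re.MULTILINE == 8
assert re.DOTALL == 16
print(re.MULTILINE | re.DOTALL)  # 24 -- flags combine as a plain bitmask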
from collections import defaultdict @@ -54,7 +54,7 @@ _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"} _STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT", "UNIQUE"} _LIST_AGGS = {"COLLECT"} -_STRUCT_AGGS = {"CORRELATION"} +_STRUCT_AGGS = {"CORRELATION", "COVARIANCE"} _INTERVAL_AGGS = set() _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", "NTH", "COLLECT"} diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index adc48159aac..301f571f5fb 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint32_t from libcpp cimport bool @@ -10,7 +10,7 @@ from libcpp.vector cimport vector cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.hash cimport hash as cpp_hash +from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -58,11 +58,11 @@ def hash(source_table, str method, int seed=0): cdef table_view c_source_view = table_view_from_table( source_table, ignore_index=True) cdef unique_ptr[column] c_result - cdef libcudf_types.hash_id c_hash_function + cdef cpp_hash_id c_hash_function if method == "murmur3": - c_hash_function = libcudf_types.hash_id.HASH_MURMUR3 + c_hash_function = cpp_hash_id.HASH_MURMUR3 elif method == "md5": - c_hash_function = libcudf_types.hash_id.HASH_MD5 + c_hash_function = cpp_hash_id.HASH_MD5 else: raise ValueError(f"Unsupported hash function: {method}") with nogil: diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index f18d0eb7f36..31f0fb2ec04 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr @@ -12,8 +12,8 @@ from cudf._lib.cpp.strings.contains cimport ( contains_re as cpp_contains_re, count_re as cpp_count_re, matches_re as cpp_matches_re, - regex_flags as regex_flags, ) +from cudf._lib.cpp.strings.regex_flags cimport regex_flags from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 9d554aa2162..bac282dccc5 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -1,5 +1,6 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
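With hash_id now declared alongside cudf::hash in hash.pxd, the binding above dispatches on the method string. A hedged usage sketch of the public entry point; hash_values is assumed to be the user-facing wrapper over this binding in this cudf version, and the exact hash outputs are illustrative:

import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
# "murmur3" maps to HASH_MURMUR3, "md5" to HASH_MD5; anything else raises.
print(df.hash_values(method="murmur3"))  # one unsigned hash per row
print(df.hash_values(method="md5"))      # one hex-digest string per row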
+from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -8,12 +9,13 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.strings.extract cimport extract as cpp_extract +from cudf._lib.cpp.strings.regex_flags cimport regex_flags from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport data_from_unique_ptr -def extract(Column source_strings, object pattern): +def extract(Column source_strings, object pattern, uint32_t flags): """ Returns data which contains extracted capture groups provided in `pattern` for all `source_strings`. @@ -24,11 +26,13 @@ def extract(Column source_strings, object pattern): cdef column_view source_view = source_strings.view() cdef string pattern_string = str(pattern).encode() + cdef regex_flags c_flags = flags with nogil: c_result = move(cpp_extract( source_view, - pattern_string + pattern_string, + c_flags )) return data_from_unique_ptr( diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index c4e4b6c38d8..2d62a3f9a4b 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -9,11 +9,11 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport string_scalar -from cudf._lib.cpp.strings.contains cimport regex_flags from cudf._lib.cpp.strings.findall cimport ( findall as cpp_findall, findall_record as cpp_findall_record, ) +from cudf._lib.cpp.strings.regex_flags cimport regex_flags from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport data_from_unique_ptr diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 7fa745719c9..89e6dbf0826 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1,8 +1,9 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. from __future__ import annotations import pickle +from functools import cached_property from typing import Any, Set import pandas as pd @@ -31,7 +32,6 @@ is_mixed_with_object_dtype, numeric_normalize_types, ) -from cudf.utils.utils import cached_property class BaseIndex(Serializable): @@ -1201,9 +1201,9 @@ def rename(self, name, inplace=False): self.name = name return None else: - out = self.copy(deep=False) + out = self.copy(deep=True) out.name = name - return out.copy(deep=True) + return out def astype(self, dtype, copy=False): """ diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 24f9dc83ca9..f57d88fb39d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
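The extract binding above now accepts a flags value, cast directly to the regex_flags enum. At the user level this surfaces through Series.str.extract (see the string.py changes further below); a small sketch, assuming MULTILINE semantics match Python's re:

import re
import cudf

s = cudf.Series(["a1\nb2", "c3"])
# With re.MULTILINE, '^' anchors at each line start rather than only at
# the start of the string; extract returns one column per capture group.
print(s.str.extract(r"^([a-z])(\d)", flags=re.MULTILINE))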
from __future__ import annotations @@ -1048,7 +1048,9 @@ def find_and_replace( f"got to_replace dtype: {to_replace_col.dtype} and " f"value dtype: {replacement_col.dtype}" ) - df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col}) + df = cudf.DataFrame._from_data( + {"old": to_replace_col, "new": replacement_col} + ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: fill_value = df._data["new"][df._data["old"].isnull()][0] @@ -1058,8 +1060,7 @@ def find_and_replace( new_categories = self.categories.append( column.as_column([fill_value]) ) - replaced = self.copy() - replaced = replaced._set_categories(new_categories) + replaced = self._set_categories(new_categories) replaced = replaced.fillna(fill_value) df = df.dropna(subset=["old"]) to_replace_col = df._data["old"] @@ -1078,35 +1079,41 @@ def find_and_replace( replacement_col = df._data["new"] # create a dataframe containing the pre-replacement categories - # and a copy of them to work with. The index of this dataframe - # represents the original ints that map to the categories - old_cats = cudf.DataFrame() - old_cats["cats"] = column.as_column(replaced.dtype.categories) - new_cats = old_cats.copy(deep=True) - - # Create a column with the appropriate labels replaced - old_cats["cats_replace"] = old_cats["cats"].replace( - to_replace_col, replacement_col + # and a column with the appropriate labels replaced. + # The index of this dataframe represents the original + # ints that map to the categories + cats_col = column.as_column(replaced.dtype.categories) + old_cats = cudf.DataFrame._from_data( + { + "cats": cats_col, + "cats_replace": cats_col.find_and_replace( + to_replace_col, replacement_col + ), + } ) # Construct the new categorical labels # If a category is being replaced by an existing one, we # want to map it to None. 
If it's totally new, we want to # map it to the new label it is to be replaced by - dtype_replace = cudf.Series(replacement_col) - dtype_replace[dtype_replace.isin(old_cats["cats"])] = None - new_cats["cats"] = new_cats["cats"].replace( - to_replace_col, dtype_replace + dtype_replace = cudf.Series._from_data({None: replacement_col}) + dtype_replace[dtype_replace.isin(cats_col)] = None + new_cats_col = cats_col.find_and_replace( + to_replace_col, dtype_replace._column ) # anything we mapped to None, we want to now filter out since # those categories don't exist anymore # Resetting the index creates a column 'index' that associates # the original integers to the new labels - bmask = new_cats._data["cats"].notnull() - new_cats = cudf.DataFrame( - {"cats": new_cats._data["cats"].apply_boolean_mask(bmask)} - ).reset_index() + bmask = new_cats_col.notnull() + new_cats_col = new_cats_col.apply_boolean_mask(bmask) + new_cats = cudf.DataFrame._from_data( + { + "index": cudf.core.column.arange(len(new_cats_col)), + "cats": new_cats_col, + } + ) # old_cats contains replaced categories and the ints that # previously mapped to those categories and the index of @@ -1507,9 +1514,15 @@ def _set_categories( old_codes = column.arange(len(cur_cats), dtype=out_code_dtype) new_codes = column.arange(len(new_cats), dtype=out_code_dtype) - new_df = cudf.DataFrame({"new_codes": new_codes, "cats": new_cats}) - old_df = cudf.DataFrame({"old_codes": old_codes, "cats": cur_cats}) - cur_df = cudf.DataFrame({"old_codes": cur_codes, "order": cur_order}) + new_df = cudf.DataFrame._from_data( + data={"new_codes": new_codes, "cats": new_cats} + ) + old_df = cudf.DataFrame._from_data( + data={"old_codes": old_codes, "cats": cur_cats} + ) + cur_df = cudf.DataFrame._from_data( + data={"old_codes": cur_codes, "order": cur_order} + ) # Join the old and new categories and line up their codes df = old_df.merge(new_df, on="cats", how="left") @@ -1519,7 +1532,7 @@ def _set_categories( df.reset_index(drop=True, inplace=True) ordered = ordered if ordered is not None else self.ordered - new_codes = df["new_codes"]._column + new_codes = df._data["new_codes"] # codes can't have masks, so take mask out before moving in return column.build_categorical_column( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2c3951c0e5e..1c1c2ef2bf6 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -177,34 +177,28 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: return self.binary_operator("NULL_EQUALS", other).all() def all(self, skipna: bool = True) -> bool: + # The skipna argument is only used for numerical columns. # If all entries are null the result is True, including when the column # is empty. - result_col = self.nans_to_nulls() if skipna else self - if result_col.null_count == result_col.size: + if self.null_count == self.size: return True - if isinstance(result_col, ColumnBase): - return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) - - return result_col + return libcudf.reduce.reduce("all", self, dtype=np.bool_) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. 
- result_col = self.nans_to_nulls() if skipna else self - if not skipna and result_col.has_nulls(): + + if not skipna and self.has_nulls(): return True - elif skipna and result_col.null_count == result_col.size: + elif skipna and self.null_count == self.size: return False - if isinstance(result_col, ColumnBase): - return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) - - return result_col + return libcudf.reduce.reduce("any", self, dtype=np.bool_) def dropna(self, drop_nan: bool = False) -> ColumnBase: - col = self.nans_to_nulls() if drop_nan else self - return drop_nulls([col])[0] + # The drop_nan argument is only used for numerical columns. + return drop_nulls([self])[0] def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -341,6 +335,14 @@ def _fill( return self def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: + # libcudf currently doesn't handle the case when abs(offset) > len(df); + # the issue tracking the fix is linked below: + # https://github.com/rapidsai/cudf/issues/10314 + if abs(offset) > len(self): + if fill_value is None: + return column_empty_like(self, masked=True) + else: + return full(len(self), fill_value, dtype=self.dtype) return libcudf.copying.shift(self, offset, fill_value) @property @@ -1164,8 +1166,9 @@ def corr(self, other: ColumnBase): f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - def nans_to_nulls(self: T) -> T: - return self + @property + def contains_na_entries(self) -> bool: + return self.null_count != 0 def _process_for_reduction( self, skipna: bool = None, min_count: int = 0 @@ -1173,14 +1176,13 @@ def _process_for_reduction( skipna = True if skipna is None else skipna if skipna: - result_col = self.nans_to_nulls() - if result_col.has_nulls(): - result_col = result_col.dropna() + if self.has_nulls(): + result_col = self.dropna() else: if self.has_nulls(): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - result_col = self + result_col = self if min_count > 0: valid_count = len(result_col) - result_col.null_count @@ -1608,8 +1610,8 @@ def build_struct_column( Parameters ---------- - names : list-like - Field names to map to children dtypes + names : sequence of strings + Field names to map to children dtypes; must be strings. children : tuple mask: Buffer @@ -2093,24 +2095,16 @@ def as_column( dtype = "bool" np_type = np.dtype(dtype).type pa_type = np_to_pa_dtype(np.dtype(dtype)) - # TODO: A warning is emitted from pyarrow 5.0.0's function - # pyarrow.lib._sequence_to_array: - # "DeprecationWarning: an integer is required (got type float). - # Implicit conversion to integers using __int__ is deprecated, - # and may be removed in a future version of Python." - # This warning does not appear in pyarrow 6.0.1 and will be - # resolved by https://github.com/rapidsai/cudf/pull/9686/. 
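The shift early-exit added above makes out-of-range offsets well defined: when abs(offset) exceeds the column length, every row of the result is either null or the fill value, so libcudf never sees the problematic case. A sketch of the user-visible behavior:

import cudf

s = cudf.Series([1, 2, 3])
print(s.shift(10))                # all rows null: offset exceeds the length
print(s.shift(10, fill_value=0))  # all rows 0 instead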
- with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - pa_array = pa.array( + data = as_column( + pa.array( arbitrary, type=pa_type, from_pandas=True if nan_as_null is None else nan_as_null, - ) - data = as_column( - pa_array, dtype=dtype, nan_as_null=nan_as_null, + ), + dtype=dtype, + nan_as_null=nan_as_null, ) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): if is_categorical_dtype(dtype): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 9b54c4d9acd..2c483cbd00b 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -20,6 +20,7 @@ import cudf from cudf import _lib as libcudf +from cudf._lib.stream_compaction import drop_nulls from cudf._typing import BinaryOperand, ColumnLike, Dtype, DtypeObj, ScalarLike from cudf.api.types import is_integer_dtype, is_number from cudf.core.buffer import Buffer @@ -316,6 +317,27 @@ def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: return self return libcudf.unary.cast(self, dtype) + def all(self, skipna: bool = True) -> bool: + # If all entries are null the result is True, including when the column + # is empty. + result_col = self.nans_to_nulls() if skipna else self + + if result_col.null_count == result_col.size: + return True + + return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + + def any(self, skipna: bool = True) -> bool: + # Early exit for fast cases. + result_col = self.nans_to_nulls() if skipna else self + + if not skipna and result_col.has_nulls(): + return True + elif skipna and result_col.null_count == result_col.size: + return False + + return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + @property def nan_count(self) -> int: if self.dtype.kind != "f": @@ -325,6 +347,14 @@ def nan_count(self) -> int: self._nan_count = nan_col.sum() return self._nan_count + def dropna(self, drop_nan: bool = False) -> NumericalColumn: + col = self.nans_to_nulls() if drop_nan else self + return drop_nulls([col])[0] + + @property + def contains_na_entries(self) -> bool: + return (self.nan_count != 0) or (self.null_count != 0) + def _process_values_for_isin( self, values: Sequence ) -> Tuple[ColumnBase, ColumnBase]: @@ -346,12 +376,14 @@ def _can_return_nan(self, skipna: bool = None) -> bool: def _process_for_reduction( self, skipna: bool = None, min_count: int = 0 - ) -> Union[ColumnBase, ScalarLike]: + ) -> Union[NumericalColumn, ScalarLike]: skipna = True if skipna is None else skipna if self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - return super()._process_for_reduction( + + col = self.nans_to_nulls() if skipna else self + return super(NumericalColumn, col)._process_for_reduction( skipna=skipna, min_count=min_count ) @@ -399,7 +431,6 @@ def find_and_replace( replacement_col = _normalize_find_and_replace_input( self.dtype, replacement ) - replaced = self.copy() if len(replacement_col) == 1 and len(to_replace_col) > 1: replacement_col = column.as_column( utils.scalar_broadcast_to( @@ -407,11 +438,13 @@ def find_and_replace( ) ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: - return replaced + return self.copy() to_replace_col, replacement_col, replaced = numeric_normalize_types( - to_replace_col, replacement_col, replaced + to_replace_col, replacement_col, self + ) + df = cudf.DataFrame._from_data( + {"old": to_replace_col, "new": replacement_col} ) - df = cudf.DataFrame({"old": to_replace_col, "new": 
replacement_col}) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: replaced = replaced.fillna( @@ -420,7 +453,7 @@ def find_and_replace( df = df.dropna(subset=["old"]) return libcudf.replace.replace( - replaced, df["old"]._column, df["new"]._column + replaced, df._data["old"], df._data["new"] ) def fillna( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ee1ddb58abc..5e40356ed0f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -574,6 +574,8 @@ def extract( ---------- pat : str Regular expression pattern with capturing groups. + flags : int, default 0 (no flags) + Flags to pass through to the regex engine (e.g. re.MULTILINE) expand : bool, default True If True, return DataFrame with one column per capture group. If False, return a Series/Index if there is one capture group or @@ -588,8 +590,8 @@ def extract( Notes ----- - The `flags` parameter is not yet supported and will raise a - NotImplementedError if anything other than the default value is passed. + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. Examples -------- @@ -618,10 +620,12 @@ def extract( 2 dtype: object """ # noqa W605 - if flags != 0: - raise NotImplementedError("`flags` parameter is not yet supported") + if not _is_supported_regex_flags(flags): + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) - data, index = libstrings.extract(self._column, pat) + data, index = libstrings.extract(self._column, pat, flags) if len(data) == 1 and expand is False: data = next(iter(data.values())) else: @@ -752,7 +756,9 @@ def contains( flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise ValueError("invalid `flags` parameter value") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) if pat is None: result_col = column.column_empty( @@ -3393,7 +3399,9 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise ValueError("invalid `flags` parameter value") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) return self._return_or_inplace( libstrings.count_re(self._column, pat, flags) @@ -3969,7 +3977,9 @@ def match( flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise ValueError("invalid `flags` parameter value") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) return self._return_or_inplace( libstrings.match_re(self._column, pat, flags) @@ -5329,7 +5339,9 @@ def find_and_replace( and replacement_col.dtype != self.dtype ): return self.copy() - df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col}) + df = cudf.DataFrame._from_data( + {"old": to_replace_col, "new": replacement_col} + ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: res = self.fillna(df._data["new"][df._data["old"].isnull()][0]) @@ -5401,7 +5413,7 @@ def binary_operator( return cast( "column.ColumnBase", libstrings.concatenate( - cudf.DataFrame({0: lhs, 1: rhs}), + cudf.DataFrame._from_data(data={0: lhs, 1: rhs}), sep=cudf.Scalar(""), na_rep=cudf.Scalar(None, "str"), ), diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 67976ac27d4..9cb86ca1cd2 100644 --- 
a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -4,7 +4,7 @@ import itertools from collections.abc import MutableMapping -from functools import reduce +from functools import cached_property, reduce from typing import ( TYPE_CHECKING, Any, @@ -20,7 +20,6 @@ import cudf from cudf.core import column -from cudf.utils.utils import cached_property if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -360,9 +359,9 @@ def select_by_index(self, index: Any) -> ColumnAccessor: start, stop, step = index.indices(len(self._data)) keys = self.names[start:stop:step] elif pd.api.types.is_integer(index): - keys = [self.names[index]] + keys = (self.names[index],) else: - keys = (self.names[i] for i in index) + keys = tuple(self.names[i] for i in index) data = {k: self._data[k] for k in keys} return self.__class__( data, multiindex=self.multiindex, level_names=self.level_names, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 99b5de757a6..b31f9f2c30e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -113,6 +113,7 @@ def __setitem__(self, key, value): key = (key, slice(None)) return self._setitem_tuple_arg(key, value) + @annotate("_CAN_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") def _can_downcast_to_series(self, df, arg): """ This method encapsulates the logic used @@ -153,6 +154,7 @@ def _can_downcast_to_series(self, df, arg): return True return False + @annotate("_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") def _downcast_to_series(self, df, arg): """ "Downcast" from a DataFrame to a Series @@ -194,6 +196,7 @@ class _DataFrameLocIndexer(_DataFrameIndexer): For selection by label. """ + @annotate("_GETITEM_SCALAR", color="green", domain="cudf_python") def _getitem_scalar(self, arg): return self._frame[arg[1]].loc[arg[0]] @@ -635,6 +638,9 @@ def __init__( if dtype: self._data = self.astype(dtype)._data + @annotate( + "DATAFRAME_INIT_FROM_SERIES_LIST", color="blue", domain="cudf_python" + ) def _init_from_series_list(self, data, columns, index): if index is None: # When `index` is `None`, the final index of @@ -733,6 +739,9 @@ def _init_from_series_list(self, data, columns, index): ) self._data = self._data.select_by_label(columns) + @annotate( + "DATAFRAME_INIT_FROM_LIST_LIKE", color="blue", domain="cudf_python" + ) def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) @@ -769,6 +778,9 @@ def _init_from_list_like(self, data, index=None, columns=None): self.columns = columns + @annotate( + "DATAFRAME_INIT_FROM_DICT_LIKE", color="blue", domain="cudf_python" + ) def _init_from_dict_like( self, data, index=None, columns=None, nan_as_null=None ): @@ -820,8 +832,8 @@ def _init_from_dict_like( self._data.multiindex = self._data.multiindex and isinstance( col_name, tuple ) - self.insert( - i, col_name, data[col_name], nan_as_null=nan_as_null + self._insert( + i, col_name, data[col_name], nan_as_null=nan_as_null, ) if columns is not None: @@ -842,6 +854,11 @@ def _from_data( return out @staticmethod + @annotate( + "DATAFRAME_ALIGN_INPUT_SERIES_INDICES", + color="blue", + domain="cudf_python", + ) def _align_input_series_indices(data, index): data = data.copy() @@ -1077,7 +1094,7 @@ def __setitem__(self, arg, value): ) else: for col_name in self._data: - scatter_map = arg[col_name] + scatter_map = arg._data[col_name] if is_scalar(value): self._data[col_name][scatter_map] = value else: @@ 
-1187,6 +1204,7 @@ def __delitem__(self, name): """ self._drop_column(name) + @annotate("DATAFRAME_SLICE", color="blue", domain="cudf_python") def _slice(self: T, arg: slice) -> T: """ _slice : slice the frame as per the arg @@ -1248,18 +1266,13 @@ def _slice(self: T, arg: slice) -> T: result.columns = self.columns return result + @annotate("DATAFRAME_MEMORY_USAGE", color="blue", domain="cudf_python") def memory_usage(self, index=True, deep=False): return Series( {str(k): v for k, v in super().memory_usage(index, deep).items()} ) - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if method == "__call__" and hasattr(cudf, ufunc.__name__): - func = getattr(cudf, ufunc.__name__) - return func(self) - else: - return NotImplemented - + @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python") def __array_function__(self, func, types, args, kwargs): cudf_df_module = DataFrame @@ -1298,6 +1311,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented # The _get_numeric_data method is necessary for dask compatibility. + @annotate("DATAFRAME_GET_NUMERIC_DATA", color="blue", domain="cudf_python") def _get_numeric_data(self): """Return a dataframe with only numeric data types""" columns = [ @@ -1307,6 +1321,7 @@ def _get_numeric_data(self): ] return self[columns] + @annotate("DATAFRAME_ASSIGN", color="blue", domain="cudf_python") def assign(self, **kwargs): """ Assign columns to DataFrame from keyword arguments. @@ -1322,10 +1337,16 @@ def assign(self, **kwargs): 1 1 4 2 2 5 """ - new = self.copy() + new_df = cudf.DataFrame(index=self.index.copy()) + for name, col in self._data.items(): + if name in kwargs: + new_df[name] = kwargs.pop(name) + else: + new_df._data[name] = col.copy() + for k, v in kwargs.items(): - new[k] = v - return new + new_df[k] = v + return new_df @classmethod @annotate("CONCAT", color="orange", domain="cudf_python") @@ -1785,10 +1806,12 @@ def _get_renderable_dataframe(self): return output + @annotate("DATAFRAME_REPR", color="blue", domain="cudf_python") def __repr__(self): output = self._get_renderable_dataframe() return self._clean_renderable_dataframe(output) + @annotate("DATAFRAME_REPR_HTML", color="blue", domain="cudf_python") def _repr_html_(self): lines = ( self._get_renderable_dataframe() @@ -1805,9 +1828,13 @@ def _repr_html_(self): lines.append("") return "\n".join(lines) + @annotate("DATAFRAME_REPR_LATEX", color="blue", domain="cudf_python") def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() + @annotate( + "DATAFRAME_GET_COLUMNS_BY_LABEL", color="blue", domain="cudf_python" + ) def _get_columns_by_label(self, labels, downcast=False): """ Return columns of dataframe by `labels` @@ -1830,7 +1857,7 @@ def _get_columns_by_label(self, labels, downcast=False): ) return out - def _binaryop( + def _prep_for_binop( self, other: Any, fn: str, @@ -1850,6 +1877,7 @@ def _binaryop( # implementation assumes that binary operations between a column and # NULL are always commutative, even for binops (like subtraction) that # are normally anticommutative. + # TODO: We probably should support pandas DataFrame/Series objects. if isinstance(rhs, Sequence): # TODO: Consider validating sequence length (pandas does). 
operands = { @@ -1913,13 +1941,33 @@ def _binaryop( right = right_dict[col] operands[col] = (left, right, reflect, fill_value) else: + return NotImplemented, None + + return operands, lhs._index + + @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") + def _binaryop( + self, + other: Any, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._prep_for_binop( + other, fn, fill_value, reflect, can_reindex + ) + if operands is NotImplemented: return NotImplemented return self._from_data( ColumnAccessor(type(self)._colwise_binop(operands, fn)), - index=lhs._index, + index=out_index, ) + @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python") def update( self, other, @@ -2013,14 +2061,27 @@ def update( self._mimic_inplace(source_df, inplace=True) + @annotate("DATAFRAME_ITER", color="blue", domain="cudf_python") def __iter__(self): return iter(self.columns) + @annotate("DATAFRAME_ITERITEMS", color="blue", domain="cudf_python") def iteritems(self): + """Iterate over column names and series pairs""" + warnings.warn( + "iteritems is deprecated and will be removed in a future version. " + "Use .items instead.", + FutureWarning, + ) + return self.items() + + @annotate("DATAFRAME_ITEMS", color="blue", domain="cudf_python") + def items(self): """Iterate over column names and series pairs""" for k in self: yield (k, self[k]) + @annotate("DATAFRAME_EQUALS", color="blue", domain="cudf_python") def equals(self, other, **kwargs): ret = super().equals(other) # If all other checks matched, validate names. @@ -2083,6 +2144,7 @@ def columns(self, columns): data, multiindex=is_multiindex, level_names=columns.names, ) + @annotate("DATAFRAME_REINDEX_INTERNAL", color="blue", domain="cudf_python") def _reindex( self, columns, dtypes=None, deep=False, index=None, inplace=False ): @@ -2159,6 +2221,7 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) + @annotate("DATAFRAME_REINDEX", color="blue", domain="cudf_python") def reindex( self, labels=None, axis=None, index=None, columns=None, copy=True ): @@ -2237,6 +2300,7 @@ def reindex( inplace=False, ) + @annotate("DATAFRAME_SET_INDEX", color="blue", domain="cudf_python") def set_index( self, keys, @@ -2279,7 +2343,7 @@ def set_index( 3 4 d 4.0 4 5 e 5.0 - Set the index to become the ‘b’ column: + Set the index to become the 'b' column: >>> df.set_index('b') a c @@ -2290,7 +2354,7 @@ def set_index( d 4 4.0 e 5 5.0 - Create a MultiIndex using columns ‘a’ and ‘b’: + Create a MultiIndex using columns 'a' and 'b': >>> df.set_index(["a", "b"]) c @@ -2497,7 +2561,7 @@ def reset_index( inplace=inplace, ) - @annotate("INSERT", color="green", domain="cudf_python") + @annotate("DATAFRAME_INSERT", color="green", domain="cudf_python") def insert(self, loc, name, value, nan_as_null=None): """Add a column to DataFrame at the index specified by loc. @@ -2508,6 +2572,29 @@ def insert(self, loc, name, value, nan_as_null=None): name : number or string name or label of column to be inserted value : Series or array-like + nan_as_null : bool, Default None + If ``None``/``True``, converts ``np.nan`` values to + ``null`` values. + If ``False``, leaves ``np.nan`` values as is. 
+ """ + return self._insert( + loc=loc, + name=name, + value=value, + nan_as_null=nan_as_null, + ignore_index=False, + ) + + @annotate("DATAFRAME__INSERT", color="green", domain="cudf_python") + def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): + """ + Same as `insert`, with additional `ignore_index` param. + + ignore_index : bool, default True + If True, there will be no index equality check & reindexing + happening. + If False, a reindexing operation is performed if + `value.index` is not equal to `self.index`. """ if name in self._data: raise NameError(f"duplicated column name {name}") @@ -2528,7 +2615,8 @@ def insert(self, loc, name, value, nan_as_null=None): if len(self) == 0: if isinstance(value, (pd.Series, Series)): - self._index = as_index(value.index) + if not ignore_index: + self._index = as_index(value.index) elif len(value) > 0: self._index = RangeIndex(start=0, stop=len(value)) new_data = self._data.__class__() @@ -2541,9 +2629,11 @@ def insert(self, loc, name, value, nan_as_null=None): ) self._data = new_data elif isinstance(value, (pd.Series, Series)): - value = Series(value, nan_as_null=nan_as_null)._align_to_index( - self._index, how="right", sort=False - ) + value = Series(value, nan_as_null=nan_as_null) + if not ignore_index: + value = value._align_to_index( + self._index, how="right", sort=False + ) value = column.as_column(value, nan_as_null=nan_as_null) @@ -2623,6 +2713,7 @@ def diff(self, periods=1, axis=0): return self - self.shift(periods=periods) + @annotate("DATAFRAME_DROP", color="green", domain="cudf_python") def drop( self, labels=None, @@ -2796,12 +2887,14 @@ def drop( if not inplace: return out + @annotate("DATAFRAME_DROP_COLUMN", color="green", domain="cudf_python") def _drop_column(self, name): """Drop a column by *name*""" if name not in self._data: raise KeyError(f"column '{name}' does not exist") del self._data[name] + @annotate("DATAFRAME_DROP_DUPLICATES", color="green", domain="cudf_python") def drop_duplicates( self, subset=None, keep="first", inplace=False, ignore_index=False ): @@ -2879,12 +2972,14 @@ def drop_duplicates( return self._mimic_inplace(outdf, inplace=inplace) + @annotate("DATAFRAME_POP", color="green", domain="cudf_python") def pop(self, item): """Return a column and drop it from the DataFrame.""" popped = self[item] del self[item] return popped + @annotate("DATAFRAME_RENAME", color="green", domain="cudf_python") def rename( self, mapper=None, @@ -3028,6 +3123,7 @@ def rename( else: return out.copy(deep=copy) + @annotate("DATAFRAME_ADD_PREFIX", color="green", domain="cudf_python") def add_prefix(self, prefix): out = self.copy(deep=True) out.columns = [ @@ -3035,6 +3131,7 @@ def add_prefix(self, prefix): ] return out + @annotate("DATAFRAME_ADD_SUFFIX", color="green", domain="cudf_python") def add_suffix(self, suffix): out = self.copy(deep=True) out.columns = [ @@ -3042,6 +3139,7 @@ def add_suffix(self, suffix): ] return out + @annotate("DATAFRAME_AGG", color="green", domain="cudf_python") def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. 
@@ -3173,6 +3271,7 @@ def agg(self, aggs, axis=None): else: raise ValueError("argument must be a string, list or dict") + @annotate("DATAFRAME_NLARGEST", color="green", domain="cudf_python") def nlargest(self, n, columns, keep="first"): """Get the rows of the DataFrame sorted by the n largest value of *columns* @@ -3304,6 +3403,7 @@ def nsmallest(self, n, columns, keep="first"): """ return self._n_largest_or_smallest(False, n, columns, keep) + @annotate("DATAFRAME_TRANSPOSE", color="green", domain="cudf_python") def transpose(self): """Transpose index and columns. @@ -3334,6 +3434,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) + @annotate("DATAFRAME_MELT", color="green", domain="cudf_python") def melt(self, **kwargs): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -3363,7 +3464,7 @@ def melt(self, **kwargs): return melt(self, **kwargs) - @annotate("JOIN", color="blue", domain="cudf_python") + @annotate("DATAFRAME_JOIN", color="blue", domain="cudf_python") def merge( self, right, @@ -3545,6 +3646,7 @@ def join( ) return df + @annotate("DATAFRAME_GROUPBY", color="green", domain="cudf_python") @copy_docstring(DataFrameGroupBy) def groupby( self, @@ -3660,7 +3762,7 @@ def query(self, expr, local_dict=None): """ # can't use `annotate` decorator here as we inspect the calling # environment. - with annotate("QUERY", color="purple", domain="cudf_python"): + with annotate("DATAFRAME_QUERY", color="purple", domain="cudf_python"): if local_dict is None: local_dict = {} @@ -3684,6 +3786,7 @@ def query(self, expr, local_dict=None): boolmask = queryutils.query_execute(self, expr, callenv) return self._apply_boolean_mask(boolmask) + @annotate("DATAFRAME_APPLY", color="green", domain="cudf_python") def apply( self, func, axis=1, raw=False, result_type=None, args=(), **kwargs ): @@ -3832,6 +3935,7 @@ def apply( return self._apply(func, _get_row_kernel, *args, **kwargs) + @annotate("DATAFRAME_APPLY_ROWS", color="green", domain="cudf_python") @applyutils.doc_apply() def apply_rows( self, @@ -3910,6 +4014,7 @@ def apply_rows( cache_key=cache_key, ) + @annotate("DATAFRAME_APPLY_CHUNKS", color="green", domain="cudf_python") @applyutils.doc_applychunks() def apply_chunks( self, @@ -3977,6 +4082,9 @@ def apply_chunks( tpb=tpb, ) + @annotate( + "DATAFRAME_PARTITION_BY_HASH", color="green", domain="cudf_python" + ) def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. @@ -4314,6 +4422,7 @@ def _sizeof_fmt(num, size_qualifier): cudf.utils.ioutils.buffer_write_lines(buf, lines) + @annotate("DATAFRAME_DESCRIBE", color="green", domain="cudf_python") @docutils.doc_describe() def describe( self, @@ -4373,6 +4482,7 @@ def describe( sort=False, ) + @annotate("DATAFRAME_TO_PANDAS", color="green", domain="cudf_python") def to_pandas(self, nullable=False, **kwargs): """ Convert to a Pandas DataFrame. @@ -4459,6 +4569,7 @@ def to_pandas(self, nullable=False, **kwargs): return out_df @classmethod + @annotate("DATAFRAME_FROM_PANDAS", color="green", domain="cudf_python") def from_pandas(cls, dataframe, nan_as_null=None): """ Convert from a Pandas DataFrame. 
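The DATAFRAME_* decorators blanketing this file are NVTX ranges for profiling with Nsight Systems. For context, the annotate helper (cuDF imports it from the nvtx package at this point) works both as a decorator and as a context manager; query above uses the context-manager form because it must inspect the caller's frame. A minimal standalone sketch, with hypothetical range names:

from nvtx import annotate

@annotate("MY_STEP", color="green", domain="example")
def my_step():
    # Appears as a named range on the Nsight Systems timeline.
    return 42

with annotate("MY_BLOCK", color="purple", domain="example"):
    my_step()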
@@ -4496,7 +4607,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): df = cls() # Set columns - for col_name, col_value in dataframe.iteritems(): + for col_name, col_value in dataframe.items(): # necessary because multi-index can return multiple # columns for a single key if len(col_value.shape) == 1: @@ -4528,6 +4639,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): return result @classmethod + @annotate("DATAFRAME_FROM_ARROW", color="green", domain="cudf_python") def from_arrow(cls, table): """ Convert from PyArrow Table to DataFrame. @@ -4583,6 +4695,7 @@ def from_arrow(cls, table): return out + @annotate("DATAFRAME_TO_ARROW", color="green", domain="cudf_python") def to_arrow(self, preserve_index=True): """ Convert to a PyArrow Table. @@ -4606,10 +4719,17 @@ def to_arrow(self, preserve_index=True): a: int64 b: int64 index: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] + index: [[1,2,3]] >>> df.to_arrow(preserve_index=False) pyarrow.Table a: int64 b: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] """ data = self.copy(deep=False) @@ -4638,8 +4758,8 @@ def to_arrow(self, preserve_index=True): for gen_name, col_name in zip( gen_names, self.index._data.names ): - data.insert( - data.shape[1], gen_name, self.index._data[col_name] + data._insert( + data.shape[1], gen_name, self.index._data[col_name], ) descr = gen_names[0] index_descr.append(descr) @@ -4657,6 +4777,7 @@ def to_arrow(self, preserve_index=True): return out.replace_schema_metadata(metadata) + @annotate("DATAFRAME_TO_RECORDS", color="green", domain="cudf_python") def to_records(self, index=True): """Convert to a numpy recarray @@ -4680,6 +4801,7 @@ def to_records(self, index=True): return ret @classmethod + @annotate("DATAFRAME_FROM_RECORDS", color="green", domain="cudf_python") def from_records(cls, data, index=None, columns=None, nan_as_null=False): """ Convert structured or record ndarray to DataFrame. @@ -4741,6 +4863,9 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): return df @classmethod + @annotate( + "DATAFRAME_FROM_ARRAYS_INTERNAL", color="green", domain="cudf_python" + ) def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): """Convert a numpy/cupy array to DataFrame. @@ -4800,6 +4925,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df._index = as_index(index) return df + @annotate("DATAFRAME_INTERPOLATE", color="green", domain="cudf_python") def interpolate( self, method="linear", @@ -4830,6 +4956,7 @@ def interpolate( **kwargs, ) + @annotate("DATAFRAME_QUANTILE", color="green", domain="cudf_python") def quantile( self, q=0.5, @@ -4945,6 +5072,7 @@ def quantile( result.index = q return result + @annotate("DATAFRAME_QUANTILES", color="green", domain="cudf_python") def quantiles(self, q=0.5, interpolation="nearest"): """ Return values at the given quantile. @@ -4984,6 +5112,7 @@ def quantiles(self, q=0.5, interpolation="nearest"): result.index = as_index(q) return result + @annotate("DATAFRAME_ISIN", color="green", domain="cudf_python") def isin(self, values): """ Whether each element in the DataFrame is contained in values. 
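The to_arrow docstring above picks up the data preview that newer pyarrow Table reprs print; the preserve_index behavior itself is unchanged. A usage sketch:

import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[1, 2, 3])
print(df.to_arrow())                      # carries an "index" column
print(df.to_arrow(preserve_index=False))  # just "a" and "b"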
@@ -5122,6 +5251,9 @@ def make_false_column_like_self(): # # Stats # + @annotate( + "DATAFRAME_PREPARE_FOR_ROWWISE_OP", color="green", domain="cudf_python" + ) def _prepare_for_rowwise_op(self, method, skipna): """Prepare a DataFrame for CuPy-based row-wise operations.""" @@ -5171,6 +5303,7 @@ def _prepare_for_rowwise_op(self, method, skipna): coerced = coerced.astype("int64", copy=False) return coerced, mask, common_dtype + @annotate("DATAFRAME_COUNT", color="green", domain="cudf_python") def count(self, axis=0, level=None, numeric_only=False, **kwargs): """ Count ``non-NA`` cells for each column or row. @@ -5217,6 +5350,7 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): "columns": 1, } + @annotate("DATAFRAME_REDUCE", color="green", domain="cudf_python") def _reduce( self, op, axis=None, level=None, numeric_only=None, **kwargs, ): @@ -5241,6 +5375,7 @@ def _reduce( elif axis == 1: return self._apply_cupy_method_axis_1(op, **kwargs) + @annotate("DATAFRAME_SCAN", color="green", domain="cudf_python") def _scan( self, op, axis=None, *args, **kwargs, ): @@ -5251,6 +5386,7 @@ def _scan( elif axis == 1: return self._apply_cupy_method_axis_1(f"cum{op}", **kwargs) + @annotate("DATAFRAME_MODE", color="green", domain="cudf_python") def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. @@ -5350,6 +5486,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df + @annotate("DATAFRAME_KURTOSIS", color="green", domain="cudf_python") def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5358,6 +5495,7 @@ def kurtosis( axis, skipna, level, numeric_only, **kwargs ) + @annotate("DATAFRAME_SKEW", color="green", domain="cudf_python") def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5366,14 +5504,17 @@ def skew( axis, skipna, level, numeric_only, **kwargs ) + @annotate("DATAFRAME_ALL", color="green", domain="cudf_python") def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).all(axis, skipna, level, **kwargs) + @annotate("DATAFRAME_ANY", color="green", domain="cudf_python") def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).any(axis, skipna, level, **kwargs) + @annotate("DATAFRAME_APPLY_CUPY", color="green", domain="cudf_python") def _apply_cupy_method_axis_1(self, method, *args, **kwargs): # This method uses cupy to perform scans and reductions along rows of a # DataFrame. Since cuDF is designed around columnar storage and @@ -5474,6 +5615,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result_df.columns = prepared.columns return result_df + @annotate("DATAFRAME_COLUMNS_VIEW", color="green", domain="cudf_python") def _columns_view(self, columns): """ Return a subset of the DataFrame's columns as a view. @@ -5482,6 +5624,7 @@ def _columns_view(self, columns): {col: self._data[col] for col in columns}, index=self.index ) + @annotate("DATAFRAME_SELECT_DTYPES", color="green", domain="cudf_python") def select_dtypes(self, include=None, exclude=None): """Return a subset of the DataFrame’s columns based on the column dtypes. 
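As _prepare_for_rowwise_op and _apply_cupy_method_axis_1 above indicate, row-wise (axis=1) reductions and scans are not natural for a columnar engine: the participating columns are coerced to a common dtype, materialized as a 2-D CuPy array, and reduced along rows, while axis=0 stays on the libcudf reduction path. Usage sketch:

import cudf

df = cudf.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
print(df.sum(axis=0))  # per column, via libcudf reductions
print(df.sum(axis=1))  # per row, via the CuPy-based path above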
@@ -5610,7 +5753,7 @@ def select_dtypes(self, include=None, exclude=None): for k, col in self._data.items(): infered_type = cudf_dtype_from_pydata_dtype(col.dtype) if infered_type in inclusion: - df.insert(len(df._data), k, col) + df._insert(len(df._data), k, col) return df @@ -5668,6 +5811,7 @@ def to_orc(self, fname, compression=None, *args, **kwargs): orc.to_orc(self, fname, compression, *args, **kwargs) + @annotate("DATAFRAME_STACK", color="green", domain="cudf_python") def stack(self, level=-1, dropna=True): """Stack the prescribed level(s) from columns to index @@ -5729,6 +5873,7 @@ def stack(self, level=-1, dropna=True): else: return result + @annotate("DATAFRAME_COV", color="green", domain="cudf_python") def cov(self, **kwargs): """Compute the covariance matrix of a DataFrame. @@ -5746,6 +5891,7 @@ def cov(self, **kwargs): df.columns = self.columns return df + @annotate("DATAFRAME_CORR", color="green", domain="cudf_python") def corr(self): """Compute the correlation matrix of a DataFrame.""" corr = cupy.corrcoef(self.values, rowvar=False) @@ -5753,6 +5899,7 @@ def corr(self): df.columns = self.columns return df + @annotate("DATAFRAME_TO_STRUCT", color="green", domain="cudf_python") def to_struct(self, name=None): """ Return a struct Series composed of the columns of the DataFrame. @@ -5766,8 +5913,16 @@ def to_struct(self, name=None): ----- Note that a copy of the columns is made. """ + if not all(isinstance(name, str) for name in self._data.names): + warnings.warn( + "DataFrame contains non-string column name(s). Struct column " + "requires field names to be strings. Non-string column names " + "will be cast to strings as the field names." + ) + field_names = [str(name) for name in self._data.names] + col = cudf.core.column.build_struct_column( - names=self._data.names, children=self._data.columns, size=len(self) + names=field_names, children=self._data.columns, size=len(self) ) return cudf.Series._from_data( cudf.core.column_accessor.ColumnAccessor( @@ -5777,6 +5932,7 @@ def to_struct(self, name=None): name=name, ) + @annotate("DATAFRAME_KEYS", color="green", domain="cudf_python") def keys(self): """ Get the columns. @@ -5824,6 +5980,7 @@ def iterrows(self): "if you wish to iterate over each row." ) + @annotate("DATAFRAME_APPEND", color="green", domain="cudf_python") def append( self, other, ignore_index=False, verify_integrity=False, sort=False ): @@ -5976,6 +6133,7 @@ def append( return cudf.concat(to_concat, ignore_index=ignore_index, sort=sort) + @annotate("DATAFRAME_PIVOT", color="green", domain="cudf_python") @copy_docstring(reshape.pivot) def pivot(self, index, columns, values=None): @@ -5983,12 +6141,14 @@ def pivot(self, index, columns, values=None): self, index=index, columns=columns, values=values ) + @annotate("DATAFRAME_UNSTACK", color="green", domain="cudf_python") @copy_docstring(reshape.unstack) def unstack(self, level=-1, fill_value=None): return cudf.core.reshape.unstack( self, level=level, fill_value=fill_value ) + @annotate("DATAFRAME_EXPLODE", color="green", domain="cudf_python") def explode(self, column, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -6038,6 +6198,46 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) + + def pct_change( + self, periods=1, fill_method="ffill", limit=None, freq=None + ): + """ + Calculates the percent change between sequential elements + in the DataFrame.
+ + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming percent change. + fill_method : str, default 'ffill' + How to handle NAs before computing percent changes. + limit : int, optional + The number of consecutive NAs to fill before stopping. + Not yet implemented. + freq : str, optional + Increment to use from time series API. + Not yet implemented. + + Returns + ------- + DataFrame + """ + if limit is not None: + raise NotImplementedError("limit parameter not supported yet.") + if freq is not None: + raise NotImplementedError("freq parameter not supported yet.") + elif fill_method not in {"ffill", "pad", "bfill", "backfill"}: + raise ValueError( + "fill_method must be one of 'ffill', 'pad', " + "'bfill', or 'backfill'." + ) + + data = self.fillna(method=fill_method, limit=limit) + + return data.diff(periods=periods) / data.shift( + periods=periods, freq=freq + ) + def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True ): @@ -6194,6 +6394,7 @@ def func(left, right, output): ) +@annotate("CUDF_FROM_PANDAS", color="green", domain="cudf_python") def from_pandas(obj, nan_as_null=None): """ Convert certain Pandas objects into the cudf equivalent. @@ -6314,6 +6515,7 @@ def from_pandas(obj, nan_as_null=None): ) +@annotate("CUDF_MERGE", color="green", domain="cudf_python") def merge(left, right, *args, **kwargs): return left.merge(right, *args, **kwargs) @@ -6406,7 +6608,11 @@ def _setitem_with_dataframe( raise ValueError("Can not insert new column with a bool mask") else: # handle append case - input_df.insert(len(input_df._data), col_1, replace_df[col_2]) + input_df._insert( + loc=len(input_df._data), + name=col_1, + value=replace_df[col_2], + ) def extract_col(df, col): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d55ab901b59..d478baf2d69 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations @@ -90,22 +90,22 @@ def _num_rows(self) -> int: return len(self._data.columns[0]) @property - def _column_names(self) -> List[Any]: # TODO: List[str]? - return self._data.names + def _column_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? + return tuple(self._data.names) @property - def _index_names(self) -> List[Any]: # TODO: List[str]? + def _index_names(self) -> Optional[Tuple[Any, ...]]: # TODO: Tuple[str]? # TODO: Temporarily suppressing mypy warnings to avoid introducing bugs # by returning an empty list where one is not expected. return ( None # type: ignore if self._index is None - else self._index._data.names + else tuple(self._index._data.names) ) @property - def _columns(self) -> List[Any]: # TODO: List[Column]? - return self._data.columns + def _columns(self) -> Tuple[Any, ...]: # TODO: Tuple[Column]? 
+ return tuple(self._data.columns) def serialize(self): header = { @@ -123,6 +123,7 @@ def deserialize(cls, header, frames): return cls_deserialize._from_data(dict(zip(column_names, columns))) @classmethod + @annotate("FRAME_FROM_DATA", color="green", domain="cudf_python") def _from_data( cls, data: MutableMapping, @@ -133,6 +134,7 @@ def _from_data( return obj @classmethod + @annotate("FRAME_FROM_COLUMNS", color="green", domain="cudf_python") def _from_columns( cls, columns: List[ColumnBase], @@ -163,6 +165,9 @@ def _from_columns( return cls._from_data(data, index) + @annotate( + "FRAME_FROM_COLUMNS_LIKE_SELF", color="green", domain="cudf_python" + ) def _from_columns_like_self( self, columns: List[ColumnBase], @@ -360,6 +365,7 @@ def memory_usage(self, deep=False): def __len__(self): return self._num_rows + @annotate("FRAME_COPY", color="green", domain="cudf_python") def copy(self: T, deep: bool = True) -> T: """ Make a copy of this object's indices and data. @@ -445,6 +451,7 @@ def copy(self: T, deep: bool = True) -> T: return new_frame + @annotate("FRAME_EQUALS", color="green", domain="cudf_python") def equals(self, other, **kwargs): """ Test whether two objects contain the same elements. @@ -527,6 +534,7 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) + @annotate("FRAME_EXPLODE", color="green", domain="cudf_python") def _explode(self, explode_column: Any, ignore_index: bool): """Helper function for `explode` in `Series` and `Dataframe`, explodes a specified nested column. Other columns' corresponding rows are @@ -550,6 +558,9 @@ def _explode(self, explode_column: Any, ignore_index: bool): res.index.names = self._index.names return res + @annotate( + "FRAME_GET_COLUMNS_BY_LABEL", color="green", domain="cudf_python" + ) def _get_columns_by_label(self, labels, downcast=False): """ Returns columns of the Frame specified by `labels` @@ -557,6 +568,9 @@ def _get_columns_by_label(self, labels, downcast=False): """ return self._data.select_by_label(labels) + @annotate( + "FRAME_GET_COLUMNS_BY_INDEX", color="green", domain="cudf_python" + ) def _get_columns_by_index(self, indices): """ Returns columns of the Frame specified by `labels` @@ -580,6 +594,7 @@ def _as_column(self): return self._data[None].copy(deep=False) + @annotate("FRAME_EMPTY_LIKE", color="green", domain="cudf_python") def _empty_like(self, keep_index=True): result = self.__class__._from_data( *libcudf.copying.table_empty_like(self, keep_index) @@ -669,6 +684,13 @@ def get_column_values_na(col): matrix[:, i] = get_column_values_na(col) return matrix + # TODO: As of now, calling cupy.asarray is _much_ faster than calling + # to_cupy. We should investigate the reasons why and whether we can provide + # a more efficient method here by exploiting __cuda_array_interface__. In + # particular, we need to benchmark how much of the overhead is coming from + # (potentially unavoidable) local copies in to_cupy and how much comes from + # inefficiencies in the implementation. 
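The TODO above can be reproduced from user code: for a null-free numeric Series, `cupy.asarray` consumes `__cuda_array_interface__` directly, while `to_cupy` goes through the generic conversion path. A small illustration of the two routes (not a benchmark):

```python
import cupy as cp
import cudf

s = cudf.Series([1, 2, 3])
a = cp.asarray(s)   # zero-copy view via __cuda_array_interface__
b = s.to_cupy()     # generic conversion path; may allocate and copy
assert bool((a == b).all())
```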
+ @annotate("FRAME_TO_CUPY", color="green", domain="cudf_python") def to_cupy( self, dtype: Union[Dtype, None] = None, @@ -703,6 +725,7 @@ def to_cupy( na_value, ) + @annotate("FRAME_TO_NUMPY", color="green", domain="cudf_python") def to_numpy( self, dtype: Union[Dtype, None] = None, @@ -737,6 +760,7 @@ def to_numpy( (lambda col: col.values_host), np.empty, dtype, na_value ) + @annotate("FRAME_CLIP", color="green", domain="cudf_python") def clip(self, lower=None, upper=None, inplace=False, axis=1): """ Trim values at input threshold(s). @@ -864,6 +888,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) + @annotate("FRAME_WHERE", color="green", domain="cudf_python") def where(self, cond, other=None, inplace=False): """ Replace values where the condition is False. @@ -922,6 +947,7 @@ def where(self, cond, other=None, inplace=False): frame=self, cond=cond, other=other, inplace=inplace ) + @annotate("FRAME_MASK", color="green", domain="cudf_python") def mask(self, cond, other=None, inplace=False): """ Replace values where the condition is True. @@ -983,6 +1009,7 @@ def mask(self, cond, other=None, inplace=False): return self.where(cond=~cond, other=other, inplace=inplace) + @annotate("FRAME_PIPE", color="green", domain="cudf_python") def pipe(self, func, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``. @@ -1113,6 +1140,7 @@ def scatter_by_map( return result + @annotate("FRAME_FILLNA", color="green", domain="cudf_python") def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1238,7 +1266,7 @@ def fillna( value = value elif not isinstance(value, abc.Mapping): value = {name: copy.deepcopy(value) for name in self._data.names} - elif isinstance(value, abc.Mapping): + else: value = { key: value.reindex(self.index) if isinstance(value, cudf.Series) @@ -1246,19 +1274,28 @@ def fillna( for key, value in value.items() } - copy_data = self._data.copy(deep=True) - - for name in copy_data.keys(): + filled_data = {} + for col_name, col in self._data.items(): + if col_name in value and method is None: + replace_val = value[col_name] + else: + replace_val = None should_fill = ( - name in value - and not libcudf.scalar._is_null_host_scalar(value[name]) + col_name in value + and col.contains_na_entries + and not libcudf.scalar._is_null_host_scalar(replace_val) ) or method is not None if should_fill: - copy_data[name] = copy_data[name].fillna(value[name], method) - result = self._from_data(copy_data, self._index) + filled_data[col_name] = col.fillna(replace_val, method) + else: + filled_data[col_name] = col.copy(deep=True) - return self._mimic_inplace(result, inplace=inplace) + return self._mimic_inplace( + self._from_data(data=filled_data, index=self._index), + inplace=inplace, + ) + @annotate("FRAME_DROPNA_COLUMNS", color="green", domain="cudf_python") def _drop_na_columns(self, how="any", subset=None, thresh=None): """ Drop columns containing nulls @@ -1276,16 +1313,21 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): else: thresh = len(df) - for col in self._data.names: + for name, col in df._data.items(): + try: + check_col = col.nans_to_nulls() + except AttributeError: + check_col = col no_threshold_valid_count = ( - len(df[col]) - df[col].nans_to_nulls().null_count + len(col) - check_col.null_count ) < thresh if no_threshold_valid_count: continue - out_cols.append(col) + out_cols.append(name) return self[out_cols] + @annotate("FRAME_INTERPOLATE", color="green", 
domain="cudf_python") def interpolate( self, method="linear", @@ -1355,6 +1397,7 @@ def interpolate( else result._gather(perm_sort.argsort()) ) + @annotate("FRAME_QUANTILES", color="green", domain="cudf_python") def _quantiles( self, q, @@ -1387,6 +1430,7 @@ def _quantiles( result._copy_type_metadata(self) return result + @annotate("FRAME_RANK", color="green", domain="cudf_python") def rank( self, axis=0, @@ -1463,6 +1507,7 @@ def rank( return self._from_data(data, index).astype(np.float64) + @annotate("FRAME_REPEAT", color="green", domain="cudf_python") def repeat(self, repeats, axis=None): """Repeats elements consecutively. @@ -1552,6 +1597,7 @@ def repeat(self, repeats, axis=None): result._copy_type_metadata(self) return result + @annotate("FRAME_SHIFT", color="green", domain="cudf_python") def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" axis = self._get_axis_from_axis_arg(axis) @@ -1567,7 +1613,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): zip(self._column_names, data_columns), self._index ) - @annotate("SAMPLE", color="orange", domain="cudf_python") + @annotate("FRAME_SAMPLE", color="orange", domain="cudf_python") def sample( self, n=None, @@ -1761,7 +1807,7 @@ def sample( return result @classmethod - @annotate("FROM_ARROW", color="orange", domain="cudf_python") + @annotate("FRAME_FROM_ARROW", color="orange", domain="cudf_python") def from_arrow(cls, data): """Convert from PyArrow Table to Frame @@ -1901,7 +1947,7 @@ def from_arrow(cls, data): return cls._from_data({name: result[name] for name in column_names}) - @annotate("TO_ARROW", color="orange", domain="cudf_python") + @annotate("FRAME_TO_ARROW", color="orange", domain="cudf_python") def to_arrow(self): """ Convert to arrow Table @@ -1916,6 +1962,10 @@ def to_arrow(self): a: int64 b: int64 index: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] + index: [[1,2,3]] """ return pa.Table.from_pydict( {name: col.to_arrow() for name, col in self._data.items()} @@ -1933,6 +1983,7 @@ def _positions_from_column_names(self, column_names): if name in set(column_names) ] + @annotate("FRAME_REPLACE", color="green", domain="cudf_python") def replace( self, to_replace=None, @@ -2146,7 +2197,7 @@ def replace( ) if not (to_replace is None and value is None): - copy_data = self._data.copy(deep=False) + copy_data = {} ( all_na_per_column, to_replace_per_column, @@ -2155,11 +2206,11 @@ def replace( to_replace=to_replace, value=value, columns_dtype_map={ - col: copy_data._data[col].dtype for col in copy_data._data + col: self._data[col].dtype for col in self._data }, ) - for name, col in copy_data.items(): + for name, col in self._data.items(): try: copy_data[name] = col.find_and_replace( to_replace_per_column[name], @@ -2219,6 +2270,7 @@ def _copy_type_metadata( return self + @annotate("FRAME_ISNULL", color="green", domain="cudf_python") def isnull(self): """ Identify missing values. @@ -2300,6 +2352,7 @@ def isnull(self): # Alias for isnull isna = isnull + @annotate("FRAME_NOTNULL", color="green", domain="cudf_python") def notnull(self): """ Identify non-missing values. @@ -2381,6 +2434,7 @@ def notnull(self): # Alias for notnull notna = notnull + @annotate("FRAME_INTERLEAVE_COLUMNS", color="green", domain="cudf_python") def interleave_columns(self): """ Interleave Series columns of a table into a single column. 
@@ -2420,6 +2474,7 @@ def interleave_columns(self): return result + @annotate("FRAME_TILE", color="green", domain="cudf_python") def tile(self, count): """ Repeats the rows from `self` DataFrame `count` times to form a @@ -2449,6 +2504,7 @@ def tile(self, count): result._copy_type_metadata(self) return result + @annotate("FRAME_SEARCHSORTED", color="green", domain="cudf_python") def searchsorted( self, values, side="left", ascending=True, na_position="last" ): @@ -2533,7 +2589,7 @@ def searchsorted( else: return result - @annotate("ARGSORT", color="yellow", domain="cudf_python") + @annotate("FRAME_ARGSORT", color="yellow", domain="cudf_python") def argsort( self, by=None, @@ -2636,6 +2692,7 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): return libcudf.sort.order_by(to_sort, ascending, na_position) + @annotate("FRAME_SIN", color="green", domain="cudf_python") def sin(self): """ Get Trigonometric sine, element-wise. @@ -2697,6 +2754,7 @@ def sin(self): """ return self._unaryop("sin") + @annotate("FRAME_COS", color="green", domain="cudf_python") def cos(self): """ Get Trigonometric cosine, element-wise. @@ -2758,6 +2816,7 @@ def cos(self): """ return self._unaryop("cos") + @annotate("FRAME_TAN", color="green", domain="cudf_python") def tan(self): """ Get Trigonometric tangent, element-wise. @@ -2819,6 +2878,7 @@ def tan(self): """ return self._unaryop("tan") + @annotate("FRAME_ASIN", color="green", domain="cudf_python") def asin(self): """ Get Trigonometric inverse sine, element-wise. @@ -2869,6 +2929,7 @@ def asin(self): """ return self._unaryop("asin") + @annotate("FRAME_ACOS", color="green", domain="cudf_python") def acos(self): """ Get Trigonometric inverse cosine, element-wise. @@ -2927,6 +2988,7 @@ def acos(self): result = result.mask((result < 0) | (result > np.pi + 1)) return result + @annotate("FRAME_ATAN", color="green", domain="cudf_python") def atan(self): """ Get Trigonometric inverse tangent, element-wise. @@ -2987,6 +3049,7 @@ def atan(self): """ return self._unaryop("atan") + @annotate("FRAME_EXP", color="green", domain="cudf_python") def exp(self): """ Get the exponential of all elements, element-wise. @@ -3049,6 +3112,7 @@ def exp(self): """ return self._unaryop("exp") + @annotate("FRAME_LOG", color="green", domain="cudf_python") def log(self): """ Get the natural logarithm of all elements, element-wise. @@ -3110,6 +3174,7 @@ def log(self): """ return self._unaryop("log") + @annotate("FRAME_SQRT", color="green", domain="cudf_python") def sqrt(self): """ Get the non-negative square-root of all elements, element-wise. @@ -3165,6 +3230,7 @@ def sqrt(self): """ return self._unaryop("sqrt") + @annotate("FRAME_ABS", color="green", domain="cudf_python") def abs(self): """ Return a Series/DataFrame with absolute numeric value of each element. @@ -3191,6 +3257,7 @@ def abs(self): return self._unaryop("abs") # Rounding + @annotate("FRAME_CEIL", color="green", domain="cudf_python") def ceil(self): """ Rounds each value upward to the smallest integral value not less @@ -3227,6 +3294,7 @@ def ceil(self): return self._unaryop("ceil") + @annotate("FRAME_FLOOR", color="green", domain="cudf_python") def floor(self): """Rounds each value downward to the largest integral value not greater than the original. 
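`searchsorted`, annotated in the hunks above, returns insertion points that keep the data sorted; a short usage sketch:

```python
import cudf

s = cudf.Series([1, 3, 5, 7])
print(s.searchsorted(4))                # 2: leftmost position where 4 fits
print(s.searchsorted(5, side="right"))  # 3: insert after the existing 5
```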
@@ -3266,6 +3334,7 @@ def floor(self): return self._unaryop("floor") + @annotate("FRAME_SCALE", color="green", domain="cudf_python") def scale(self): """ Scale values to [0, 1] in float64 @@ -3300,6 +3369,7 @@ def scale(self): scaled._index = self._index.copy(deep=False) return scaled + @annotate("FRAME_INTERNAL_MERGE", color="green", domain="cudf_python") def _merge( self, right, @@ -3343,6 +3413,7 @@ def _merge( suffixes=suffixes, ).perform_merge() + @annotate("FRAME_IS_SORTED", color="green", domain="cudf_python") def _is_sorted(self, ascending=None, null_position=None): """ Returns a boolean indicating whether the data of the Frame are sorted @@ -3373,12 +3444,14 @@ def _is_sorted(self, ascending=None, null_position=None): self, ascending=ascending, null_position=null_position ) + @annotate("FRAME_SPLIT", color="green", domain="cudf_python") def _split(self, splits, keep_index=True): results = libcudf.copying.table_split( self, splits, keep_index=keep_index ) return [self.__class__._from_data(*result) for result in results] + @annotate("FRAME_ENCODE", color="green", domain="cudf_python") def _encode(self): data, index, indices = libcudf.transform.table_encode(self) for name, col in data.items(): @@ -3386,6 +3459,7 @@ def _encode(self): keys = self.__class__._from_data(data, index) return keys, indices + @annotate("FRAME_UNARYOP", color="green", domain="cudf_python") def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) return self.__class__._from_data( @@ -3425,6 +3499,7 @@ def _binaryop( raise NotImplementedError @classmethod + @annotate("FRAME_COLWISE_BINOP", color="green", domain="cudf_python") def _colwise_binop( cls, operands: Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], @@ -3583,6 +3658,7 @@ def _colwise_binop( return output + @annotate("FRAME_DOT", color="green", domain="cudf_python") def dot(self, other, reflect=False): """ Get dot product of frame and other, (binary operator `dot`). @@ -3622,6 +3698,8 @@ def dot(self, other, reflect=False): >>> [1, 2, 3, 4] @ s 10 """ + # TODO: This function does not currently support nulls. + # TODO: This function does not properly support misaligned indexes. lhs = self.values if isinstance(other, Frame): rhs = other.values @@ -3632,6 +3710,16 @@ def dot(self, other, reflect=False): ): rhs = cupy.asarray(other) else: + # TODO: This should raise an exception, not return NotImplemented, + # but __matmul__ relies on the current behavior. We should either + # move this implementation to __matmul__ and call it from here + # (checking for NotImplemented and raising NotImplementedError if + # that's what's returned), or __matmul__ should catch a + # NotImplementedError from here and return NotImplemented. The + # latter feels cleaner (putting the implementation in this method + # rather than in the operator) but will be slower in the (highly + # unlikely) case that we're multiplying a cudf object with another + # type of object that somehow supports this behavior. return NotImplemented if reflect: lhs, rhs = rhs, lhs @@ -3743,6 +3831,7 @@ def _reduce(self, *args, **kwargs): f"Reductions are not supported for objects of type {type(self)}." 
) + @annotate("FRAME_MIN", color="green", domain="cudf_python") def min( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3788,6 +3877,7 @@ def min( **kwargs, ) + @annotate("FRAME_MAX", color="green", domain="cudf_python") def max( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3833,6 +3923,7 @@ def max( **kwargs, ) + @annotate("FRAME_SUM", color="green", domain="cudf_python") def sum( self, axis=None, @@ -3891,6 +3982,7 @@ def sum( **kwargs, ) + @annotate("FRAME_PRODUCT", color="green", domain="cudf_python") def product( self, axis=None, @@ -3955,6 +4047,7 @@ def product( # Alias for pandas compatibility. prod = product + @annotate("FRAME_MEAN", color="green", domain="cudf_python") def mean( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -3999,6 +4092,7 @@ def mean( **kwargs, ) + @annotate("FRAME_STD", color="green", domain="cudf_python") def std( self, axis=None, @@ -4055,6 +4149,7 @@ def std( **kwargs, ) + @annotate("FRAME_VAR", color="green", domain="cudf_python") def var( self, axis=None, @@ -4110,6 +4205,7 @@ def var( **kwargs, ) + @annotate("FRAME_KURTOSIS", color="green", domain="cudf_python") def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4178,6 +4274,7 @@ def kurt( **kwargs, ) + @annotate("FRAME_SKEW", color="green", domain="cudf_python") def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4235,6 +4332,7 @@ def skew( **kwargs, ) + @annotate("FRAME_ALL", color="green", domain="cudf_python") def all(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether all elements are True in DataFrame. @@ -4270,6 +4368,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): "all", axis=axis, skipna=skipna, level=level, **kwargs, ) + @annotate("FRAME_ANY", color="green", domain="cudf_python") def any(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether any elements is True in DataFrame. @@ -4305,6 +4404,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): "any", axis=axis, skipna=skipna, level=level, **kwargs, ) + @annotate("FRAME_SUM_OF_SQUARES", color="green", domain="cudf_python") def sum_of_squares(self, dtype=None): """Return the sum of squares of values. @@ -4328,6 +4428,7 @@ def sum_of_squares(self, dtype=None): """ return self._reduce("sum_of_squares", dtype=dtype) + @annotate("FRAME_MEDIAN", color="green", domain="cudf_python") def median( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4373,22 +4474,28 @@ def median( ) # Scans + @annotate("FRAME_SCAN", color="green", domain="cudf_python") def _scan(self, op, axis=None, skipna=True, cast_to_int=False): skipna = True if skipna is None else skipna results = {} for name, col in self._data.items(): if skipna: - result_col = self._data[name].nans_to_nulls() + try: + result_col = col.nans_to_nulls() + except AttributeError: + result_col = col else: - result_col = self._data[name].copy() - if result_col.has_nulls(include_nan=True): + if col.has_nulls(include_nan=True): # Workaround as find_first_value doesn't seem to work # incase of bools. first_index = int( - result_col.isnull().astype("int8").find_first_value(1) + col.isnull().astype("int8").find_first_value(1) ) + result_col = col.copy() result_col[first_index:] = None + else: + result_col = col if ( cast_to_int @@ -4407,6 +4514,7 @@ def _scan(self, op, axis=None, skipna=True, cast_to_int=False): # for Index._from_data and simplify. 
return self._from_data(results, index=self._index) + @annotate("FRAME_CUMMIN", color="green", domain="cudf_python") def cummin(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative minimum of the Series or DataFrame. @@ -4450,6 +4558,7 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): """ return self._scan("min", axis=axis, skipna=skipna, *args, **kwargs) + @annotate("FRAME_CUMMAX", color="green", domain="cudf_python") def cummax(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative maximum of the Series or DataFrame. @@ -4493,6 +4602,7 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): """ return self._scan("max", axis=axis, skipna=skipna, *args, **kwargs) + @annotate("FRAME_CUMSUM", color="green", domain="cudf_python") def cumsum(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative sum of the Series or DataFrame. @@ -4539,6 +4649,7 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): "sum", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs ) + @annotate("FRAME_CUMPROD", color="green", domain="cudf_python") def cumprod(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative product of the Series or DataFrame. @@ -4584,6 +4695,7 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): "prod", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs ) + @annotate("FRAME_TO_JSON", color="green", domain="cudf_python") @ioutils.doc_to_json() def to_json(self, path_or_buf=None, *args, **kwargs): """{docstring}""" @@ -4592,18 +4704,21 @@ def to_json(self, path_or_buf=None, *args, **kwargs): self, path_or_buf=path_or_buf, *args, **kwargs ) + @annotate("FRAME_TO_HDF", color="green", domain="cudf_python") @ioutils.doc_to_hdf() def to_hdf(self, path_or_buf, key, *args, **kwargs): """{docstring}""" cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) + @annotate("FRAME_TO_DLPACK", color="green", domain="cudf_python") @ioutils.doc_to_dlpack() def to_dlpack(self): """{docstring}""" return cudf.io.dlpack.to_dlpack(self) + @annotate("FRAME_TO_STRING", color="green", domain="cudf_python") def to_string(self): """ Convert to string @@ -4629,12 +4744,15 @@ def to_string(self): def __str__(self): return self.to_string() + @annotate("FRAME_DEEP_COPY", color="green", domain="cudf_python") def __deepcopy__(self, memo): return self.copy(deep=True) + @annotate("FRAME_COPY", color="green", domain="cudf_python") def __copy__(self): return self.copy(deep=False) + @annotate("FRAME_HEAD", color="green", domain="cudf_python") def head(self, n=5): """ Return the first `n` rows. 
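`__copy__` and `__deepcopy__` above simply forward to `copy(deep=False)` and `copy(deep=True)`, so the standard-library helpers behave as expected:

```python
import copy
import cudf

df = cudf.DataFrame({"a": [1, 2]})
shallow = copy.copy(df)   # copy(deep=False): shares the column buffers
deep = copy.deepcopy(df)  # copy(deep=True): fully independent data
```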
@@ -4718,6 +4836,7 @@ def head(self, n=5): """ return self.iloc[:n] + @annotate("FRAME_TAIL", color="green", domain="cudf_python") def tail(self, n=5): """ Returns the last n rows as a new DataFrame or Series @@ -4749,6 +4868,7 @@ def tail(self, n=5): return self.iloc[-n:] + @annotate("FRAME_ROLLING", color="green", domain="cudf_python") @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None @@ -4762,6 +4882,7 @@ def rolling( win_type=win_type, ) + @annotate("FRAME_NANS_TO_NULLS", color="green", domain="cudf_python") def nans_to_nulls(self): """ Convert nans (if any) to nulls @@ -4808,14 +4929,15 @@ def nans_to_nulls(self): 1 3.14 2 """ - return self._from_data( - { - name: col.copy().nans_to_nulls() - for name, col in self._data.items() - }, - self._index, - ) + result_data = {} + for name, col in self._data.items(): + try: + result_data[name] = col.nans_to_nulls() + except AttributeError: + result_data[name] = col.copy() + return self._from_data(result_data, self._index) + @annotate("FRAME_INVERT", color="green", domain="cudf_python") def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data( @@ -4826,6 +4948,7 @@ def __invert__(self): self._index, ) + @annotate("FRAME_ADD", color="green", domain="cudf_python") def add(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -4896,6 +5019,7 @@ def add(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "add", fill_value) + @annotate("FRAME_RADD", color="green", domain="cudf_python") def radd(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -4975,6 +5099,7 @@ def radd(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "add", fill_value, reflect=True) + @annotate("FRAME_SUBTRACT", color="green", domain="cudf_python") def subtract(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -5057,6 +5182,7 @@ def subtract(self, other, axis, level=None, fill_value=None): sub = subtract + @annotate("FRAME_RSUB", color="green", domain="cudf_python") def rsub(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -5140,6 +5266,7 @@ def rsub(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "sub", fill_value, reflect=True) + @annotate("FRAME_MULTIPLY", color="green", domain="cudf_python") def multiply(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5224,6 +5351,7 @@ def multiply(self, other, axis, level=None, fill_value=None): mul = multiply + @annotate("FRAME_RMUL", color="green", domain="cudf_python") def rmul(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5308,6 +5436,7 @@ def rmul(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mul", fill_value, reflect=True) + @annotate("FRAME_MOD", color="green", domain="cudf_python") def mod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5378,6 +5507,7 @@ def mod(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mod", fill_value) + @annotate("FRAME_RMOD", 
color="green", domain="cudf_python") def rmod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5460,6 +5590,7 @@ def rmod(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mod", fill_value, reflect=True) + @annotate("FRAME_POW", color="green", domain="cudf_python") def pow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe series and other, element-wise @@ -5539,6 +5670,7 @@ def pow(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "pow", fill_value) + @annotate("FRAME_RPOW", color="green", domain="cudf_python") def rpow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe or series and other, element-wise @@ -5618,6 +5750,7 @@ def rpow(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "pow", fill_value, reflect=True) + @annotate("FRAME_FLOORDIV", color="green", domain="cudf_python") def floordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5697,6 +5830,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "floordiv", fill_value) + @annotate("FRAME_RFLOORDIV", color="green", domain="cudf_python") def rfloordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5793,6 +5927,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "floordiv", fill_value, reflect=True) + @annotate("FRAME_TRUEDIV", color="green", domain="cudf_python") def truediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -5881,6 +6016,7 @@ def truediv(self, other, axis, level=None, fill_value=None): div = truediv divide = truediv + @annotate("FRAME_RTRUEDIV", color="green", domain="cudf_python") def rtruediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -5973,6 +6109,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # Alias for rtruediv rdiv = rtruediv + @annotate("FRAME_EQ", color="green", domain="cudf_python") def eq(self, other, axis="columns", level=None, fill_value=None): """Equal to, element-wise (binary operator eq). @@ -6048,6 +6185,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None): other=other, fn="eq", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_NE", color="green", domain="cudf_python") def ne(self, other, axis="columns", level=None, fill_value=None): """Not equal to, element-wise (binary operator ne). @@ -6123,6 +6261,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None): other=other, fn="ne", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_LT", color="green", domain="cudf_python") def lt(self, other, axis="columns", level=None, fill_value=None): """Less than, element-wise (binary operator lt). @@ -6198,6 +6337,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None): other=other, fn="lt", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_LE", color="green", domain="cudf_python") def le(self, other, axis="columns", level=None, fill_value=None): """Less than or equal, element-wise (binary operator le). 
@@ -6273,6 +6413,7 @@ def le(self, other, axis="columns", level=None, fill_value=None): other=other, fn="le", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_GT", color="green", domain="cudf_python") def gt(self, other, axis="columns", level=None, fill_value=None): """Greater than, element-wise (binary operator gt). @@ -6348,6 +6489,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None): other=other, fn="gt", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_GE", color="green", domain="cudf_python") def ge(self, other, axis="columns", level=None, fill_value=None): """Greater than or equal, element-wise (binary operator ge). @@ -6446,6 +6588,11 @@ def nunique(self, method: builtins.str = "sort", dropna: bool = True): } +@annotate( + "FRAME_GET_REPLACEMENT_VALUES_FOR_COLUMNS", + color="green", + domain="cudf_python", +) def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] ) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]: @@ -6610,6 +6757,7 @@ def _is_series(obj): return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None +@annotate("FRAME_DROP_ROWS_BY_LABELS", color="green", domain="cudf_python") def _drop_rows_by_labels( obj: DataFrameOrSeries, labels: Union[ColumnLike, abc.Iterable, str], @@ -6658,10 +6806,9 @@ def _drop_rows_by_labels( join_res = working_df.join(to_join, how="leftanti") # 4. Reconstruct original layout, and rename - join_res.insert( + join_res._insert( ilevel, name=join_res._index.name, value=join_res._index ) - join_res = join_res.reset_index(drop=True) midx = cudf.MultiIndex.from_frame( join_res.iloc[:, 0:idx_nlv], names=obj._index.names diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ff700144bed..b90f857ce84 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -4,6 +4,7 @@ import itertools import pickle import warnings +from functools import cached_property import numpy as np import pandas as pd @@ -11,12 +12,13 @@ import cudf from cudf._lib import groupby as libgroupby +from cudf._lib.reshape import interleave_columns from cudf._typing import DataFrameOrSeries from cudf.api.types import is_list_like from cudf.core.abc import Serializable from cudf.core.column.column import arange, as_column from cudf.core.multiindex import MultiIndex -from cudf.utils.utils import GetAttrGetItemMixin, cached_property +from cudf.utils.utils import GetAttrGetItemMixin # The three functions below return the quantiles [25%, 50%, 75%] @@ -286,7 +288,7 @@ def agg(self, func): if not self._as_index: for col_name in reversed(self.grouping._named_columns): - result.insert( + result._insert( 0, col_name, result.index.get_level_values(col_name)._values, @@ -958,6 +960,177 @@ def corr(self, method="pearson", min_periods=1): return res + def cov(self, min_periods=0, ddof=1): + """ + Compute the pairwise covariance among the columns of a DataFrame, + excluding NA/null values. + + The returned DataFrame is the covariance matrix of the columns of + the DataFrame. + + Both NA and null values are automatically excluded from the + calculation. See the note below about bias from missing values. + + A threshold can be set for the minimum number of observations + for each value created. Comparisons with observations below this + threshold will be returned as `NA`. 
+ + This method is generally used for the analysis of time series data to + understand the relationship between different measures across time. + + Parameters + ---------- + min_periods: int, optional + Minimum number of observations required per pair of columns + to have a valid result. + + ddof: int, optional + Delta degrees of freedom, default is 1. + + Returns + ------- + DataFrame + Covariance matrix. + + Notes + ----- + Returns the covariance matrix of the DataFrame's time series. + The covariance is normalized by N-ddof. + + For DataFrames that have Series that are missing data + (assuming that data is missing at random) the returned covariance + matrix will be an unbiased estimate of the variance and covariance + between the member Series. + + However, for many applications this estimate may not be acceptable + because the estimated covariance matrix is not guaranteed to be + positive semi-definite. This could lead to estimated correlations + having absolute values which are greater than one, and/or a + non-invertible covariance matrix. See + `Estimation of covariance matrices + <https://en.wikipedia.org/wiki/Estimation_of_covariance_matrices>`_ + for more details. + + Examples + -------- + >>> import cudf + >>> gdf = cudf.DataFrame({ + ... "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + ... "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + ... }) + >>> gdf + id val1 val2 val3 + 0 a 5 4 4 + 1 a 4 5 5 + 2 a 6 6 6 + 3 b 4 1 1 + 4 b 8 2 2 + 5 b 7 9 9 + 6 c 4 8 8 + 7 c 5 5 5 + 8 c 2 1 1 + >>> gdf.groupby("id").cov() + val1 val2 val3 + id + a val1 1.000000 0.500000 0.500000 + val2 0.500000 1.000000 1.000000 + val3 0.500000 1.000000 1.000000 + b val1 4.333333 3.500000 3.500000 + val2 3.500000 19.000000 19.000000 + val3 3.500000 19.000000 19.000000 + c val1 2.333333 3.833333 3.833333 + val2 3.833333 12.333333 12.333333 + val3 3.833333 12.333333 12.333333 + """ + + # create an expanded dataframe consisting of all combinations of + # the struct column-pairs used in the covariance calculation, + # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) + column_names = self.grouping.values.columns.tolist() + num_cols = len(column_names) + + column_pair_structs = {} + for x, y in itertools.combinations_with_replacement(column_names, 2): + # The number of output columns is the number of input columns + # squared. We directly call the struct column factory here to + # reduce overhead and avoid copying data. Since libcudf groupby + # maintains a cache of aggregation requests, reusing the same + # column also makes use of previously cached column means and + # reduces kernel costs. + + # check if the input column names are strings; raise a warning + # if not, and cast them to strings + if not (isinstance(x, str) and isinstance(y, str)): + warnings.warn( + "DataFrame contains non-string column name(s). " + "Struct columns require field names to be strings. " + "Non-string column names will be cast to strings " + "in the result's field names."
+ ) + x, y = str(x), str(y) + + column_pair_structs[(x, y)] = cudf.core.column.build_struct_column( + names=(x, y), + children=(self.obj._data[x], self.obj._data[y]), + size=len(self.obj), + ) + + column_pair_groupby = cudf.DataFrame._from_data( + column_pair_structs + ).groupby(by=self.grouping.keys) + + try: + gb_cov = column_pair_groupby.agg( + lambda x: x.cov(min_periods, ddof) + ) + except RuntimeError as e: + if "Unsupported groupby reduction type-agg combination" in str(e): + raise TypeError( + "Covariance accepts only numerical column-pairs" + ) + raise + + # ensure that column-pair labels are arranged in ascending order + cols_list = [ + (y, x) if i > j else (x, y) + for j, y in enumerate(column_names) + for i, x in enumerate(column_names) + ] + cols_split = [ + cols_list[i : i + num_cols] + for i in range(0, len(cols_list), num_cols) + ] + + def combine_columns(gb_cov, ys): + list_of_columns = [gb_cov._data[y] for y in ys] + frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys) + return interleave_columns(frame) + + # interleave: combine the correlation results for each column-pair + # into a single column + res = cudf.DataFrame._from_data( + { + x: combine_columns(gb_cov, ys) + for ys, x in zip(cols_split, column_names) + } + ) + + # create a multiindex for the groupby correlated dataframe, + # to match pandas behavior + unsorted_idx = gb_cov.index.repeat(num_cols) + idx_sort_order = unsorted_idx._get_sorted_inds() + sorted_idx = unsorted_idx._gather(idx_sort_order) + if len(gb_cov): + # TO-DO: Should the operation below be done on the CPU instead? + sorted_idx._data[None] = as_column( + np.tile(column_names, len(gb_cov.index)) + ) + res.index = MultiIndex._from_data(sorted_idx._data) + + return res + def var(self, ddof=1): """Compute the column-wise variance of the values in each group. diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index f71f930a21c..5b60e8dbd1c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1,10 +1,11 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. from __future__ import annotations import math import pickle import warnings +from functools import cached_property from numbers import Number from typing import ( Any, @@ -54,7 +55,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import find_common_type -from cudf.utils.utils import cached_property, search_range +from cudf.utils.utils import search_range T = TypeVar("T", bound="Frame") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fab5d75f62b..bc7337d0a42 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
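The MultiIndex construction at the end of `cov` repeats each group key once per value column and lays the column names alongside, which is what produces the two-level index in the docstring example above. The layout logic, sketched standalone with NumPy:

```python
import numpy as np

group_keys = ["a", "b"]          # one entry per group
column_names = ["val1", "val2"]  # value columns of the covariance matrix

level0 = np.repeat(group_keys, len(column_names))  # a, a, b, b
level1 = np.tile(column_names, len(group_keys))    # val1, val2, val1, val2
print(list(zip(level0, level1)))
# [('a', 'val1'), ('a', 'val2'), ('b', 'val1'), ('b', 'val2')]
```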
"""Base class for Frame types that have an index.""" from __future__ import annotations @@ -6,6 +6,7 @@ import operator import warnings from collections import Counter, abc +from functools import cached_property from typing import Callable, Type, TypeVar from uuid import uuid4 @@ -24,13 +25,11 @@ is_integer_dtype, is_list_like, ) -from cudf.core.column import arange, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame -from cudf.utils.utils import cached_property doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. @@ -59,15 +58,14 @@ def _indices_from_labels(obj, labels): - from cudf.core.column import column if not isinstance(labels, cudf.MultiIndex): - labels = column.as_column(labels) + labels = cudf.core.column.as_column(labels) if is_categorical_dtype(obj.index): labels = labels.astype("category") codes = labels.codes.astype(obj.index._values.codes.dtype) - labels = column.build_categorical_column( + labels = cudf.core.column.build_categorical_column( categories=labels.dtype.categories, codes=codes, ordered=labels.dtype.ordered, @@ -78,8 +76,12 @@ def _indices_from_labels(obj, labels): # join is not guaranteed to maintain the index ordering # so we will sort it with its initial ordering which is stored # in column "__" - lhs = cudf.DataFrame({"__": arange(len(labels))}, index=labels) - rhs = cudf.DataFrame({"_": arange(len(obj))}, index=obj.index) + lhs = cudf.DataFrame( + {"__": cudf.core.column.arange(len(labels))}, index=labels + ) + rhs = cudf.DataFrame( + {"_": cudf.core.column.arange(len(obj))}, index=obj.index + ) return lhs.join(rhs).sort_values("__")["_"] @@ -256,8 +258,6 @@ def iloc(self): Selecting rows and column by position. - Examples - -------- >>> df = cudf.DataFrame({'a': range(20), ... 'b': range(20), ... 'c': range(20)}) @@ -334,7 +334,7 @@ def sort_index( Parameters ---------- - axis : {0 or ‘index’, 1 or ‘columns’}, default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 The axis along which to sort. The value 0 identifies the rows, and 1 identifies the columns. level : int or level name or list of ints or list of level names @@ -346,7 +346,7 @@ def sort_index( If True, perform operation in-place. kind : sorting method such as `quick sort` and others. Not yet supported. - na_position : {‘first’, ‘last’}, default ‘last’ + na_position : {'first', 'last'}, default 'last' Puts NaNs at the beginning if first; last puts NaNs at the end. sort_remaining : bool, default True Not yet supported @@ -858,7 +858,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): except Exception as e: raise RuntimeError("UDF kernel execution failed.") from e - col = as_column(ans_col) + col = cudf.core.column.as_column(ans_col) col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) result = cudf.Series._from_data({None: col}, self._index) @@ -1000,9 +1000,9 @@ def _align_to_index( # to recover ordering after index alignment. 
sort_col_id = str(uuid4()) if how == "left": - lhs[sort_col_id] = arange(len(lhs)) + lhs[sort_col_id] = cudf.core.column.arange(len(lhs)) elif how == "right": - rhs[sort_col_id] = arange(len(rhs)) + rhs[sort_col_id] = cudf.core.column.arange(len(rhs)) result = lhs.join(rhs, how=how, sort=sort) if how in ("left", "right"): @@ -1695,6 +1695,154 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) + # For more detail on this function and how it should work, see + # https://numpy.org/doc/stable/reference/ufuncs.html + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # We don't currently support reduction, accumulation, etc. We also + # don't support any special kwargs or higher arity ufuncs than binary. + if method != "__call__" or kwargs or ufunc.nin > 2: + return NotImplemented + + # Binary operations + binary_operations = { + # Arithmetic binary operations. + "add": "add", + "subtract": "sub", + "multiply": "mul", + "matmul": "matmul", + "divide": "truediv", + "true_divide": "truediv", + "floor_divide": "floordiv", + "power": "pow", + "float_power": "pow", + "remainder": "mod", + "mod": "mod", + "fmod": "mod", + # Bitwise binary operations. + "bitwise_and": "and", + "bitwise_or": "or", + "bitwise_xor": "xor", + # Comparison binary operators + "greater": "gt", + "greater_equal": "ge", + "less": "lt", + "less_equal": "le", + "not_equal": "ne", + "equal": "eq", + } + + # First look for methods of the class. + fname = ufunc.__name__ + if fname in binary_operations: + reflect = self is not inputs[0] + other = inputs[0] if reflect else inputs[1] + + # These operators need to be mapped to their inverses when + # performing a reflected operation because no reflected version of + # the operators themselves exist. + ops_without_reflection = { + "gt": "lt", + "ge": "le", + "lt": "gt", + "le": "ge", + # ne and eq are symmetric, so they are their own inverse op + "ne": "ne", + "eq": "eq", + } + + op = binary_operations[fname] + if reflect and op in ops_without_reflection: + op = ops_without_reflection[op] + reflect = False + op = f"__{'r' if reflect else ''}{op}__" + + # pandas bitwise operations return bools if indexes are misaligned. + if ( + "bitwise" in fname + and isinstance(other, IndexedFrame) + and not self.index.equals(other.index) + ): + return getattr(self, op)(other).astype(bool) + # Float_power returns float irrespective of the input type. + if fname == "float_power": + return getattr(self, op)(other).astype(float) + return getattr(self, op)(other) + + # Special handling for unary operations. + if fname == "negative": + return self * -1 + if fname == "positive": + return self.copy(deep=True) + if fname == "invert": + return ~self + if fname == "absolute": + return self.abs() + if fname == "fabs": + return self.abs().astype(np.float64) + + # Note: There are some operations that may be supported by libcudf but + # are not supported by pandas APIs. In particular, libcudf binary + # operations support logical and/or operations, but those operations + # are not defined on pd.Series/DataFrame. For now those operations will + # dispatch to cupy, but if ufuncs are ever a bottleneck we could add + # special handling to dispatch those (or any other) functions that we + # could implement without cupy. + + # Attempt to dispatch all other functions to cupy. + cupy_func = getattr(cp, fname) + if cupy_func: + # Indices must be aligned before converting to arrays. 
+ if ufunc.nin == 2: + other = inputs[self is inputs[0]] + inputs, index = self._prep_for_binop(other, fname) + else: + inputs = { + name: (col, None, False, None) + for name, col in self._data.items() + } + index = self._index + + mask = None + data = [{} for _ in range(ufunc.nout)] + for name, (left, right, _, _) in inputs.items(): + cupy_inputs = [] + # TODO: I'm jumping through multiple hoops to get the unary + # behavior to match up with the binary. I should see if there + # are better patterns to employ here. + for inp in (left, right) if ufunc.nin == 2 else (left,): + if ( + isinstance(inp, cudf.core.column.ColumnBase) + and inp.has_nulls() + ): + new_mask = cudf.core.column.as_column(inp.nullmask) + + # TODO: This is a hackish way to perform a bitwise and + # of bitmasks. Once we expose + # cudf::detail::bitwise_and, then we can use that + # instead. + mask = new_mask if mask is None else (mask & new_mask) + + # Arbitrarily fill with zeros. For ufuncs, we assume + # that the end result propagates nulls via a bitwise + # and, so these elements are irrelevant. + inp = inp.fillna(0) + cupy_inputs.append(cp.asarray(inp)) + + cp_output = cupy_func(*cupy_inputs, **kwargs) + if ufunc.nout == 1: + cp_output = (cp_output,) + for i, out in enumerate(cp_output): + data[i][name] = cudf.core.column.as_column(out).set_mask( + mask + ) + + out = tuple( + self.__class__._from_data(out, index=index) for out in data + ) + return out[0] if ufunc.nout == 1 else out + + return NotImplemented + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 39ff4718550..c7e46cf0165 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,7 +1,7 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations -from typing import TYPE_CHECKING, Callable, cast +from typing import TYPE_CHECKING, Any, Callable, List, cast import cudf from cudf import _lib as libcudf @@ -320,7 +320,7 @@ def _sort_result(self, result: Frame) -> Frame: # same order as given in 'on'. If the indices are used as # keys, the index will be sorted. If one index is specified, # the key columns on the other side will be used to sort. - by = [] + by: List[Any] = [] if self._using_left_index and self._using_right_index: if result._index is not None: by.extend(result._index._data.columns) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 8581b97c217..5e0cd2ca8cb 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
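In user terms, the `__array_ufunc__` implementation above keeps NumPy ufuncs on the GPU: binary arithmetic and comparison ufuncs map to the corresponding cudf operators, and everything else is dispatched to the matching CuPy kernel, with nulls filled for the kernel and masked back into the result. A hedged sketch of both paths:

```python
import numpy as np
import cudf

s = cudf.Series([0.0, None, 3.0])
print(np.add(s, 1))  # mapped to s.__add__(1); the null row stays null
print(np.sin(s))     # no cudf method in the table, so cp.sin runs on the
                     # GPU and the null row is masked back afterwards
```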
from __future__ import annotations @@ -6,6 +6,7 @@ import numbers import pickle from collections.abc import Sequence +from functools import cached_property from numbers import Integral from typing import Any, List, MutableMapping, Optional, Tuple, Union @@ -22,11 +23,7 @@ from cudf.core._compat import PANDAS_GE_120 from cudf.core.frame import Frame from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index -from cudf.utils.utils import ( - NotIterable, - _maybe_indices_to_slice, - cached_property, -) +from cudf.utils.utils import NotIterable, _maybe_indices_to_slice class MultiIndex(Frame, BaseIndex, NotIterable): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2fd062152de..8574a152c44 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -958,14 +958,6 @@ def to_frame(self, name=None): def memory_usage(self, index=True, deep=False): return sum(super().memory_usage(index, deep).values()) - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if method == "__call__": - return get_appropriate_dispatched_func( - cudf, cudf.Series, cupy, ufunc, inputs, kwargs - ) - else: - return NotImplemented - def __array_function__(self, func, types, args, kwargs): handled_types = [cudf.Series] for t in types: @@ -1214,9 +1206,9 @@ def __repr__(self): lines.append(category_memory) return "\n".join(lines) - def _binaryop( + def _prep_for_binop( self, - other: Frame, + other: Any, fn: str, fill_value: Any = None, reflect: bool = False, @@ -1248,24 +1240,55 @@ def _binaryop( lhs = self operands = lhs._make_operands_for_binop(other, fill_value, reflect) + return operands, lhs._index + + def _binaryop( + self, + other: Frame, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._prep_for_binop( + other, fn, fill_value, reflect, can_reindex + ) return ( - lhs._from_data( - data=lhs._colwise_binop(operands, fn), index=lhs._index, + self._from_data( + data=self._colwise_binop(operands, fn), index=out_index, ) if operands is not NotImplemented else NotImplemented ) def logical_and(self, other): + warnings.warn( + "Series.logical_and is deprecated and will be removed.", + FutureWarning, + ) return self._binaryop(other, "l_and").astype(np.bool_) def remainder(self, other): + warnings.warn( + "Series.remainder is deprecated and will be removed.", + FutureWarning, + ) return self._binaryop(other, "mod") def logical_or(self, other): + warnings.warn( + "Series.logical_or is deprecated and will be removed.", + FutureWarning, + ) return self._binaryop(other, "l_or").astype(np.bool_) def logical_not(self): + warnings.warn( + "Series.logical_not is deprecated and will be removed.", + FutureWarning, + ) return self._unaryop("not") @copy_docstring(CategoricalAccessor) # type: ignore @@ -1782,10 +1805,7 @@ def astype(self, dtype, copy=False, errors="raise"): try: data = self._column.astype(dtype) - return self._from_data( - {self.name: (data.copy(deep=True) if copy else data)}, - index=self._index, - ) + return self._from_data({self.name: data}, index=self._index) except Exception as e: if errors == "raise": @@ -3289,14 +3309,16 @@ def merge( return result def add_prefix(self, prefix): - result = self.copy(deep=True) - result.index = prefix + self.index.astype(str) - return result + return Series._from_data( + data=self._data.copy(deep=True), + index=prefix + self.index.astype(str), + ) def add_suffix(self, suffix): - result = 
self.copy(deep=True) - result.index = self.index.astype(str) + suffix - return result + return Series._from_data( + data=self._data.copy(deep=True), + index=self.index.astype(str) + suffix, + ) def keys(self): """ @@ -3838,8 +3860,8 @@ def is_leap_year(self): Series Booleans indicating if dates belong to a leap year. - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-02-01', end='2013-02-01', freq='1Y')) @@ -3896,7 +3918,7 @@ def quarter(self): Integer indicating which quarter the date belongs to. Examples - ------- + -------- >>> import cudf >>> s = cudf.Series(["2020-05-31 08:00:00","1999-12-31 18:40:00"], ... dtype="datetime64[ms]") @@ -3972,8 +3994,8 @@ def days_in_month(self): Series Integers representing the number of days in month - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-08-01', end='2001-08-01', freq='1M')) @@ -4023,8 +4045,8 @@ def is_month_end(self): Series Booleans indicating if dates are the last day of the month. - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-08-26', end='2000-09-03', freq='1D')) @@ -4069,8 +4091,8 @@ def is_quarter_start(self): Series Booleans indicating if dates are the begining of a quarter - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-09-26', end='2000-10-03', freq='1D')) @@ -4115,8 +4137,8 @@ def is_quarter_end(self): Series Booleans indicating if dates are the end of a quarter - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-09-26', end='2000-10-03', freq='1D')) @@ -4163,8 +4185,8 @@ def is_year_start(self): Series Booleans indicating if dates are the first day of the year. - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series(pd.date_range("2017-12-30", periods=3)) >>> dates @@ -4197,8 +4219,8 @@ def is_year_end(self): Series Booleans indicating if dates are the last day of the year. - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> dates = cudf.Series(pd.date_range("2017-12-30", periods=3)) >>> dates diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index bf867923b57..50b206d3388 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
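Reviewer note: `logical_and`, `logical_or`, `logical_not`, and `remainder` now warn before their eventual removal. A sketch of how downstream code could surface these warnings early during migration; this is hypothetical caller code, not part of this diff, and the `&` operator on boolean Series is one plausible replacement for `logical_and`:

```python
import warnings

import cudf

s = cudf.Series([True, False, True])
t = cudf.Series([True, True, False])

# Promote the new FutureWarnings to errors to flush out deprecated calls.
with warnings.catch_warnings():
    warnings.simplefilter("error", FutureWarning)
    try:
        out = s.logical_and(t)
    except FutureWarning:
        out = (s & t).astype(bool)  # migrate to the operator spelling
```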
"""Base class for Frame types that only have a single column.""" from __future__ import annotations @@ -274,7 +274,7 @@ def factorize(self, na_sentinel=-1): def _make_operands_for_binop( self, - other: T, + other: Any, fill_value: Any = None, reflect: bool = False, *args, @@ -310,7 +310,7 @@ def _make_operands_for_binop( else: result_name = self.name - # This needs to be tested correctly + # TODO: This needs to be tested correctly if isinstance(other, SingleColumnFrame): other = other._column elif not _is_scalar_or_zero_d_array(other): diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 0f4256e49a6..8ffd75b1d76 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -302,8 +302,8 @@ def apply(self, func, *args, **kwargs): ----- See notes of the :meth:`cudf.Series.applymap` - Example - ------- + Examples + -------- >>> import cudf >>> def count_if_gt_3(window): diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index a09fb1f8e12..62260cbb822 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -20,8 +20,8 @@ def _make_empty_df(filepath_or_buffer, columns): orc_file = orc.ORCFile(filepath_or_buffer) schema = orc_file.schema col_names = schema.names if columns is None else columns - return cudf.DataFrame( - { + return cudf.DataFrame._from_data( + data={ col_name: cudf.core.column.column_empty( row_count=0, dtype=schema.field(col_name).type.to_pandas_dtype(), diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index a919b00692d..948428de4f0 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -7,6 +7,7 @@ from uuid import uuid4 import numpy as np +from nvtx import annotate from pyarrow import dataset as ds, parquet as pq import cudf @@ -16,6 +17,7 @@ from cudf.utils import ioutils +@annotate("_WRITE_PARQUET", color="green", domain="cudf_python") def _write_parquet( df, paths, @@ -73,6 +75,7 @@ def _write_parquet( # Logic chosen to match: https://arrow.apache.org/ # docs/_modules/pyarrow/parquet.html#write_to_dataset +@annotate("WRITE_TO_DATASET", color="green", domain="cudf_python") def write_to_dataset( df, root_path, @@ -161,6 +164,7 @@ def write_to_dataset( @ioutils.doc_read_parquet_metadata() +@annotate("READ_PARQUET_METADATA", color="green", domain="cudf_python") def read_parquet_metadata(path): """{docstring}""" @@ -173,6 +177,7 @@ def read_parquet_metadata(path): return num_rows, num_row_groups, col_names +@annotate("_PROCESS_DATASET", color="green", domain="cudf_python") def _process_dataset( paths, fs, filters=None, row_groups=None, categorical_partitions=True, ): @@ -308,6 +313,7 @@ def _process_dataset( @ioutils.doc_read_parquet() +@annotate("READ_PARQUET", color="green", domain="cudf_python") def read_parquet( filepath_or_buffer, engine="cudf", @@ -435,6 +441,7 @@ def read_parquet( ) +@annotate("_PARQUET_TO_FRAME", color="green", domain="cudf_python") def _parquet_to_frame( paths_or_buffers, *args, @@ -502,6 +509,7 @@ def _parquet_to_frame( ) +@annotate("_WRITE_PARQUET", color="green", domain="cudf_python") def _read_parquet( filepaths_or_buffers, engine, @@ -535,6 +543,7 @@ def _read_parquet( @ioutils.doc_to_parquet() +@annotate("TO_PARQUET", color="green", domain="cudf_python") def to_parquet( df, path, @@ -646,6 +655,7 @@ def _generate_filename(): return uuid4().hex + ".parquet" +@annotate("_GET_PARTITIONED", color="green", domain="cudf_python") def _get_partitioned( df, root_path, @@ 
-689,6 +699,7 @@ def _get_partitioned(


 class ParquetDatasetWriter:
+    @annotate("ParquetDatasetWriter_INIT", color="green", domain="cudf_python")
     def __init__(
         self,
         path,
@@ -765,6 +776,9 @@ def __init__(
         self.path_cw_map: Dict[str, int] = {}
         self.filename = None

+    @annotate(
+        "ParquetDatasetWriter_WRITE_TABLE", color="green", domain="cudf_python"
+    )
     def write_table(self, df):
         """
         Write a dataframe to the file/dataset
@@ -821,6 +835,9 @@ def write_table(self, df):
         self.path_cw_map.update({k: new_cw_idx for k in new_paths})
         self._chunked_writers[-1][0].write_table(grouped_df, part_info)

+    @annotate(
+        "ParquetDatasetWriter_CLOSE", color="green", domain="cudf_python"
+    )
     def close(self, return_metadata=False):
         """
         Close all open files and optionally return footer metadata as a binary
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index 6c602d321eb..e767c0c62be 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -1,5 +1,6 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.

+import itertools
 import re
 import warnings
 from collections.abc import Mapping, Sequence
@@ -46,7 +47,7 @@ def set_random_null_mask_inplace(series, null_probability=0.5, seed=None):
     probs = [null_probability, 1 - null_probability]
     rng = np.random.default_rng(seed=seed)
     mask = rng.choice([False, True], size=len(series), p=probs)
-    series[mask] = None
+    series.iloc[mask] = None


 # TODO: This function should be removed. Anywhere that it is being used should
@@ -330,3 +331,9 @@ def does_not_raise():

 def xfail_param(param, **kwargs):
     return pytest.param(param, marks=pytest.mark.xfail(**kwargs))
+
+
+parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
+    "left_dtype,right_dtype",
+    list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
+)
diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py
index 041bd055f0a..4d5b5926d6e 100644
--- a/python/cudf/cudf/tests/conftest.py
+++ b/python/cudf/cudf/tests/conftest.py
@@ -1,10 +1,41 @@
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+
+import os
 import pathlib

 import pytest

 import rmm  # noqa: F401

+_CURRENT_DIRECTORY = str(pathlib.Path(__file__).resolve().parent)
+

 @pytest.fixture(scope="session")
 def datadir():
     return pathlib.Path(__file__).parent / "data"
+
+
+# To set and remove the NO_EXTERNAL_ONLY_APIS environment variable we must use
+# the sessionstart and sessionfinish hooks rather than a simple autouse,
+# session-scoped fixture because these variables must be set before collection
+# occurs: the environment variable is checked as soon as cudf is imported
+# anywhere.
+def pytest_sessionstart(session):
+    """
+    Called after the Session object has been created and
+    before performing collection and entering the run test loop.
+    """
+    os.environ["NO_EXTERNAL_ONLY_APIS"] = "1"
+    os.environ["_CUDF_TEST_ROOT"] = _CURRENT_DIRECTORY
+
+
+def pytest_sessionfinish(session, exitstatus):
+    """
+    Called after whole test run finished, right before
+    returning the exit status to the system.
+    """
+    try:
+        del os.environ["NO_EXTERNAL_ONLY_APIS"]
+        del os.environ["_CUDF_TEST_ROOT"]
+    except KeyError:
+        pass
diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py
index 3fe0321ec54..e4b4d5020ea 100644
--- a/python/cudf/cudf/tests/test_array_ufunc.py
+++ b/python/cudf/cudf/tests/test_array_ufunc.py
@@ -1,175 +1,208 @@
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
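Reviewer note: the conftest change above works because `pytest_sessionstart` runs before collection, whereas even an autouse session-scoped fixture only runs after test modules (and therefore cudf) have been imported. A stripped-down sketch of the pattern with a hypothetical flag name:

```python
# conftest.py (sketch)
import os

_FLAG = "MYLIB_STRICT_MODE"  # hypothetical environment variable


def pytest_sessionstart(session):
    # Runs before collection, so the flag is visible before any test
    # module (and hence the library under test) is imported.
    os.environ[_FLAG] = "1"


def pytest_sessionfinish(session, exitstatus):
    # Avoid leaking the setting into anything run after this session.
    os.environ.pop(_FLAG, None)
```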
+ +import operator +import warnings +from contextlib import contextmanager +from functools import reduce + import cupy as cp import numpy as np -import pandas as pd import pytest import cudf -from cudf.testing._utils import assert_eq - - -@pytest.fixture -def np_ar_tup(): - np.random.seed(0) - return (np.random.random(100), np.random.random(100)) - +from cudf.testing._utils import assert_eq, set_random_null_mask_inplace -comparison_ops_ls = [ - np.greater, - np.greater_equal, - np.less, - np.less_equal, - np.equal, - np.not_equal, +_UFUNCS = [ + obj + for obj in (getattr(np, name) for name in dir(np)) + if isinstance(obj, np.ufunc) ] -@pytest.mark.parametrize( - "func", comparison_ops_ls + [np.subtract, np.fmod, np.power] -) -def test_ufunc_cudf_non_nullseries(np_ar_tup, func): - x, y = np_ar_tup[0], np_ar_tup[1] - s_1, s_2 = cudf.Series(x), cudf.Series(y) - expect = func(x, y) - got = func(s_1, s_2) - assert_eq(expect, got.to_numpy()) - - -@pytest.mark.parametrize( - "func", [np.bitwise_and, np.bitwise_or, np.bitwise_xor], -) -def test_ufunc_cudf_series_bitwise(func): - np.random.seed(0) - x = np.random.randint(size=100, low=0, high=100) - y = np.random.randint(size=100, low=0, high=100) - - s_1, s_2 = cudf.Series(x), cudf.Series(y) - expect = func(x, y) - got = func(s_1, s_2) - assert_eq(expect, got.to_numpy()) - - -@pytest.mark.parametrize( - "func", - [ - np.subtract, - np.multiply, - np.floor_divide, - np.true_divide, - np.power, - np.remainder, - np.divide, - ], -) -def test_ufunc_cudf_null_series(np_ar_tup, func): - x, y = np_ar_tup[0].astype(np.float32), np_ar_tup[1].astype(np.float32) - x[0] = np.nan - y[1] = np.nan - s_1, s_2 = cudf.Series(x), cudf.Series(y) - expect = func(x, y) - got = func(s_1, s_2) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - scalar = 0.5 - expect = func(x, scalar) - got = func(s_1, scalar) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - expect = func(scalar, x) - got = func(scalar, s_1) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - -@pytest.mark.xfail( - reason="""cuDF comparison operations with incorrectly - returns False rather than """ -) -@pytest.mark.parametrize( - "func", comparison_ops_ls, -) -def test_ufunc_cudf_null_series_comparison_ops(np_ar_tup, func): - x, y = np_ar_tup[0].astype(np.float32), np_ar_tup[1].astype(np.float32) - x[0] = np.nan - y[1] = np.nan - s_1, s_2 = cudf.Series(x), cudf.Series(y) - expect = func(x, y) - got = func(s_1, s_2) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - scalar = 0.5 - expect = func(x, scalar) - got = func(s_1, scalar) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - expect = func(scalar, x) - got = func(scalar, s_1) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - -@pytest.mark.parametrize( - "func", [np.logaddexp, np.fmax, np.fmod], -) -def test_ufunc_cudf_series_cupy_array(np_ar_tup, func): - x, y = np_ar_tup[0], np_ar_tup[1] - expect = func(x, y) - - cudf_s = cudf.Series(x) - cupy_ar = cp.array(y) - got = func(cudf_s, cupy_ar) - assert_eq(expect, got.to_numpy()) - - -@pytest.mark.parametrize( - "func", - [np.fmod, np.logaddexp, np.bitwise_and, np.bitwise_or, np.bitwise_xor], -) -def test_error_with_null_cudf_series(func): - s_1 = cudf.Series([1, 2]) - s_2 = cudf.Series([1, None]) - - # this thows a value error - # because of nulls in cudf.Series - with pytest.raises(ValueError): - func(s_1, s_2) - - s_1 = cudf.Series([1, 2]) - s_2 = cudf.Series([1, 2, None]) - - # this throws a value-error if indexes are not aligned - # following pandas behavior for ufunc numpy 
dispatching - with pytest.raises( - ValueError, match="Can only compare identically-labeled Series objects" +@contextmanager +def _hide_ufunc_warnings(ufunc): + # pandas raises warnings for some inputs to the following ufuncs: + name = ufunc.__name__ + if name in { + "arccos", + "arccosh", + "arcsin", + "arctanh", + "fmod", + "log", + "log10", + "log2", + "reciprocal", + }: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + f"invalid value encountered in {name}", + category=RuntimeWarning, + ) + warnings.filterwarnings( + "ignore", + f"divide by zero encountered in {name}", + category=RuntimeWarning, + ) + yield + else: + yield + + +@pytest.mark.parametrize("ufunc", _UFUNCS) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +def test_ufunc_series(ufunc, has_nulls, indexed): + # Note: This test assumes that all ufuncs are unary or binary. + fname = ufunc.__name__ + if indexed and fname in ( + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + "equal", ): - func(s_1, s_2) + pytest.skip("Comparison operators do not support misaligned indexes.") + + if (indexed or has_nulls) and fname == "matmul": + pytest.xfail("Frame.dot currently does not support indexes or nulls") + + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + pandas_args = args = [ + cudf.Series( + cp.random.randint(low=1, high=10, size=N), + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg) + pandas_args = [arg.fillna(0) for arg in args] + + # Note: Different indexes must be aligned before the mask is computed. + # This requires using an internal function (_align_indices), and that + # is unlikely to change for the foreseeable future. + aligned = ( + cudf.core.series._align_indices(args, allow_non_unique=True) + if indexed and ufunc.nin == 2 + else args + ) + mask = reduce(operator.or_, (a.isna() for a in aligned)).to_pandas() + + try: + got = ufunc(*args) + except AttributeError as e: + # We xfail if we don't have an explicit dispatch and cupy doesn't have + # the method so that we can easily identify these methods. As of this + # writing, the only missing methods are isnat and heaviside. 
+ if "module 'cupy' has no attribute" in str(e): + pytest.xfail(reason="Operation not supported by cupy") + raise + + with _hide_ufunc_warnings(ufunc): + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + + try: + if ufunc.nout > 1: + for g, e in zip(got, expect): + if has_nulls: + e[mask] = np.nan + assert_eq(g, e) + else: + if has_nulls: + expect[mask] = np.nan + assert_eq(got, expect) + except AssertionError: + # TODO: This branch can be removed when + # https://github.com/rapidsai/cudf/issues/10178 is resolved + if fname in ("power", "float_power"): + not_equal = cudf.from_pandas(expect) != got + not_equal[got.isna()] = False + diffs = got[not_equal] - expect[not_equal.to_pandas()] + if diffs.abs().max() == 1: + pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") + raise @pytest.mark.parametrize( - "func", [np.absolute, np.sign, np.exp2, np.tanh], + "ufunc", [np.add, np.greater, np.greater_equal, np.logical_and] ) -def test_ufunc_cudf_series_with_index(func): - data = [-1, 2, 3, 0] - index = [2, 3, 1, 0] - cudf_s = cudf.Series(data=data, index=index) - pd_s = pd.Series(data=data, index=index) - - expect = func(pd_s) - got = func(cudf_s) - - assert_eq(got, expect) - - -@pytest.mark.parametrize( - "func", [np.logaddexp2], -) -def test_ufunc_cudf_series_with_nonaligned_index(func): - cudf_s1 = cudf.Series(data=[-1, 2, 3, 0], index=[2, 3, 1, 0]) - cudf_s2 = cudf.Series(data=[-1, 2, 3, 0], index=[3, 1, 0, 2]) - - # this throws a value-error if indexes are not aligned - # following pandas behavior for ufunc numpy dispatching - with pytest.raises( - ValueError, match="Can only compare identically-labeled Series objects" - ): - func(cudf_s1, cudf_s2) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +@pytest.mark.parametrize("type_", ["cupy", "numpy", "list"]) +@pytest.mark.parametrize("reflect", [True, False]) +def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_, reflect): + fname = ufunc.__name__ + if fname in ("greater", "greater_equal", "logical_and") and has_nulls: + pytest.xfail( + "The way cudf casts nans in arrays to nulls during binops with " + "cudf objects is currently incompatible with pandas." + ) + if reflect and has_nulls and type_ == "cupy": + pytest.skip( + "When cupy is the left operand there is no way for us to avoid " + "calling its binary operators, which cannot handle cudf objects " + "that contain nulls." + ) + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + args = [ + cudf.Series( + cp.random.rand(N), + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg) + + # Cupy doesn't support nulls, so we fill with nans before converting. 
+ args[1] = args[1].fillna(cp.nan) + mask = args[0].isna().to_pandas() + + arg1 = args[1].to_cupy() if type_ == "cupy" else args[1].to_numpy() + if type_ == "list": + arg1 = arg1.tolist() + + if reflect: + got = ufunc(arg1, args[0]) + expect = ufunc(args[1].to_numpy(), args[0].to_pandas()) + else: + got = ufunc(args[0], arg1) + expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) + + if ufunc.nout > 1: + for g, e in zip(got, expect): + if has_nulls: + e[mask] = np.nan + if type_ == "cupy" and reflect: + assert (cp.asnumpy(g) == e).all() + else: + assert_eq(g, e) + else: + if has_nulls: + expect[mask] = np.nan + if type_ == "cupy" and reflect: + assert (cp.asnumpy(got) == expect).all() + else: + assert_eq(got, expect) @pytest.mark.parametrize( @@ -182,3 +215,103 @@ def test_ufunc_cudf_series_error_with_out_kwarg(func): # this throws a value-error because of presence of out kwarg with pytest.raises(TypeError): func(x1=cudf_s1, x2=cudf_s2, out=cudf_s3) + + +# Skip matmul since it requires aligned shapes. +@pytest.mark.parametrize("ufunc", (uf for uf in _UFUNCS if uf != np.matmul)) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +def test_ufunc_dataframe(ufunc, has_nulls, indexed): + # Note: This test assumes that all ufuncs are unary or binary. + fname = ufunc.__name__ + # TODO: When pandas starts supporting misaligned indexes properly, remove + # this check but enable the one below. + if indexed: + pytest.xfail( + "pandas does not currently support misaligned indexes in " + "DataFrames, but we do. Until this is fixed we will skip these " + "tests. See the error here: " + "https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L212, " # noqa: E501 + "called from https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L258" # noqa: E501 + ) + # TODO: Enable the check below when we remove the check above. + # if indexed and fname in ( + # "greater", + # "greater_equal", + # "less", + # "less_equal", + # "not_equal", + # "equal", + # ): + # pytest.skip("Comparison operators do not support misaligned indexes.") # noqa: E501 + + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + # TODO: Add tests of mismatched columns etc. + pandas_args = args = [ + cudf.DataFrame( + {"foo": cp.random.randint(low=1, high=10, size=N)}, + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg["foo"]) + pandas_args = [arg.copy() for arg in args] + for arg in pandas_args: + arg["foo"] = arg["foo"].fillna(0) + + # Note: Different indexes must be aligned before the mask is computed. + # This requires using an internal function (_align_indices), and that + # is unlikely to change for the foreseeable future. 
+ aligned = ( + cudf.core.dataframe._align_indices(*args) + if indexed and ufunc.nin == 2 + else args + ) + mask = reduce( + operator.or_, (a["foo"].isna() for a in aligned) + ).to_pandas() + + try: + got = ufunc(*args) + except AttributeError as e: + # We xfail if we don't have an explicit dispatch and cupy doesn't have + # the method so that we can easily identify these methods. As of this + # writing, the only missing methods are isnat and heaviside. + if "module 'cupy' has no attribute" in str(e): + pytest.xfail(reason="Operation not supported by cupy") + raise + + with _hide_ufunc_warnings(ufunc): + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + + try: + if ufunc.nout > 1: + for g, e in zip(got, expect): + if has_nulls: + e[mask] = np.nan + assert_eq(g, e) + else: + if has_nulls: + expect[mask] = np.nan + assert_eq(got, expect) + except AssertionError: + # TODO: This branch can be removed when + # https://github.com/rapidsai/cudf/issues/10178 is resolved + if fname in ("power", "float_power"): + not_equal = cudf.from_pandas(expect) != got + not_equal[got.isna()] = False + diffs = got[not_equal] - cudf.from_pandas( + expect[not_equal.to_pandas()] + ) + if diffs["foo"].abs().max() == 1: + pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") + raise diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index a064bec9e82..9eb01ae31b4 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
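Reviewer note: in the ufunc tests above, the expected null positions are rebuilt by OR-reducing each operand's null indicator after index alignment. The idea in isolation, with plain pandas and illustrative data:

```python
import operator
from functools import reduce

import pandas as pd

operands = [pd.Series([1.0, None, 3.0]), pd.Series([None, 2.0, 3.0])]

# A result element is null if *any* operand is null there; the tests apply
# this mask to the pandas result after computing on null-filled data.
mask = reduce(operator.or_, (s.isna() for s in operands))
print(mask.tolist())  # [True, True, False]
```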
@@ -210,7 +210,7 @@ def test_can_parse_no_schema(): assert_eq(expected, actual) -@pytest.mark.parametrize("rows", [0, 1, 10, 100000]) +@pytest.mark.parametrize("rows", [0, 1, 10, 1000]) @pytest.mark.parametrize("codec", ["null", "deflate", "snappy"]) def test_avro_compression(rows, codec): schema = { diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 76add8b9c5d..02ca7a0cd58 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -4,7 +4,7 @@ import decimal import operator import random -from itertools import product +from itertools import combinations_with_replacement, product import cupy as cp import numpy as np @@ -216,13 +216,12 @@ def test_series_compare(cmpop, obj_class, dtype): def _series_compare_nulls_typegen(): - tests = [] - tests += list(product(DATETIME_TYPES, DATETIME_TYPES)) - tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES)) - tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES)) - tests += list(product(STRING_TYPES, STRING_TYPES)) - - return tests + return [ + *combinations_with_replacement(DATETIME_TYPES, 2), + *combinations_with_replacement(TIMEDELTA_TYPES, 2), + *combinations_with_replacement(NUMERIC_TYPES, 2), + *combinations_with_replacement(STRING_TYPES, 2), + ] @pytest.mark.parametrize("cmpop", _cmpops) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index b8724fe36f5..1ab5931fe5f 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -574,7 +574,7 @@ def test_concat_empty_dataframes(df, other, ignore_index): expected = pd.concat(other_pd, ignore_index=ignore_index) actual = gd.concat(other_gd, ignore_index=ignore_index) if expected.shape != df.shape: - for key, col in actual[actual.columns].iteritems(): + for key, col in actual[actual.columns].items(): if is_categorical_dtype(col.dtype): if not is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: @@ -1184,7 +1184,7 @@ def test_concat_join_empty_dataframes( ) if expected.shape != df.shape: if axis == 0: - for key, col in actual[actual.columns].iteritems(): + for key, col in actual[actual.columns].items(): if is_categorical_dtype(col.dtype): if not is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: @@ -1306,7 +1306,7 @@ def test_concat_join_empty_dataframes_axis_1( ) if expected.shape != df.shape: if axis == 0: - for key, col in actual[actual.columns].iteritems(): + for key, col in actual[actual.columns].items(): if is_categorical_dtype(col.dtype): expected[key] = expected[key].fillna("-1") actual[key] = col.astype("str").fillna("-1") diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 9208b8c7cd4..f3d69e1745e 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
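Reviewer note: swapping `product` for `combinations_with_replacement` in the dtype-pair generators above trims the mirrored (b, a) duplicates while keeping the (a, a) diagonal. The count difference, runnable anywhere:

```python
from itertools import combinations_with_replacement, product

types = ["int8", "int16", "float32"]

# Ordered pairs: 3 * 3 = 9, including both ("int8", "int16") and its mirror.
assert len(list(product(types, repeat=2))) == 9

# Unordered pairs with repeats: 3 * 4 / 2 = 6.
assert len(list(combinations_with_replacement(types, 2))) == 6
```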
import gzip import os @@ -8,6 +8,7 @@ from io import BytesIO, StringIO from pathlib import Path +import cupy as cp import numpy as np import pandas as pd import pytest @@ -1009,17 +1010,17 @@ def test_small_zip(tmpdir): def test_csv_reader_carriage_return(tmpdir): rows = 1000 names = ["int_row", "int_double_row"] - buffer = ",".join(names) + "\r\n" for row in range(rows): buffer += str(row) + ", " + str(2 * row) + "\r\n" df = read_csv(StringIO(buffer)) + expect = cudf.DataFrame( + {"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2} + ) assert len(df) == rows - for row in range(0, rows): - assert df[names[0]][row] == row - assert df[names[1]][row] == 2 * row + assert_eq(expect, df) def test_csv_reader_tabs(): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c7dfdd9d9f9..acd9e28c661 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -917,9 +917,7 @@ def test_dataframe_dtypes(): dtypes = pd.Series( [np.int32, np.float32, np.float64], index=["c", "a", "b"] ) - df = cudf.DataFrame( - {k: np.ones(10, dtype=v) for k, v in dtypes.iteritems()} - ) + df = cudf.DataFrame({k: np.ones(10, dtype=v) for k, v in dtypes.items()}) assert df.dtypes.equals(dtypes) @@ -1956,7 +1954,7 @@ def test_dataframe_reductions(data, axis, func, skipna): elif func not in cudf.core.dataframe._cupy_nan_methods_map: if skipna is False: expected_exception = NotImplementedError - elif any(col.nullable for name, col in gdf.iteritems()): + elif any(col.nullable for name, col in gdf.items()): expected_exception = ValueError elif func in ("cummin", "cummax"): expected_exception = AttributeError @@ -2134,7 +2132,7 @@ def test_iter(pdf, gdf): def test_iteritems(gdf): - for k, v in gdf.iteritems(): + for k, v in gdf.items(): assert k in gdf.columns assert isinstance(v, cudf.Series) assert_eq(v, gdf[k]) @@ -3444,29 +3442,37 @@ def test_get_numeric_data(): @pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20]) +@pytest.mark.parametrize("period", [-15, -1, 0, 1, 15]) @pytest.mark.parametrize("data_empty", [False, True]) def test_shift(dtype, period, data_empty): - + # TODO : this function currently tests for series.shift() + # but should instead test for dataframe.shift() if data_empty: data = None else: if dtype == np.int8: # to keep data in range - data = gen_rand(dtype, 100000, low=-2, high=2) + data = gen_rand(dtype, 10, low=-2, high=2) else: - data = gen_rand(dtype, 100000) + data = gen_rand(dtype, 10) - gdf = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) - pdf = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) + gs = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) + ps = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) - shifted_outcome = gdf.a.shift(period).fillna(0) - expected_outcome = pdf.a.shift(period).fillna(0).astype(dtype) + shifted_outcome = gs.a.shift(period) + expected_outcome = ps.a.shift(period) + # pandas uses NaNs to signal missing value and force converts the + # results columns to float types if data_empty: - assert_eq(shifted_outcome, expected_outcome, check_index_type=False) + assert_eq( + shifted_outcome, + expected_outcome, + check_index_type=False, + check_dtype=False, + ) else: - assert_eq(shifted_outcome, expected_outcome) + assert_eq(shifted_outcome, expected_outcome, check_dtype=False) @pytest.mark.parametrize("dtype", NUMERIC_TYPES) @@ -8811,6 +8817,124 @@ def test_dataframe_add_suffix(): 
assert_eq(got, expected) +@pytest.mark.parametrize( + "data, gkey", + [ + ( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + }, + ["id"], + ), + ( + { + "id": [0, 0, 0, 0, 1, 1, 1], + "a": [10.0, 3, 4, 2.0, -3.0, 9.0, 10.0], + "b": [10.0, 23, -4.0, 2, -3.0, 9, 19.0], + }, + ["id", "a"], + ), + ], +) +@pytest.mark.parametrize( + "min_periods", [0, 3], +) +@pytest.mark.parametrize( + "ddof", [1, 2], +) +def test_groupby_covariance(data, gkey, min_periods, ddof): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.groupby(gkey).cov(min_periods=min_periods, ddof=ddof) + expected = pdf.groupby(gkey).cov(min_periods=min_periods, ddof=ddof) + + assert_eq(expected, actual) + + +def test_groupby_covariance_multiindex_dataframe(): + gdf = cudf.DataFrame( + { + "a": [1, 1, 2, 2], + "b": [1, 1, 2, 3], + "c": [2, 3, 4, 5], + "d": [6, 8, 9, 1], + } + ).set_index(["a", "b"]) + + actual = gdf.groupby(level=["a", "b"]).cov() + expected = gdf.to_pandas().groupby(level=["a", "b"]).cov() + + assert_eq(expected, actual) + + +def test_groupby_covariance_empty_columns(): + gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) + pdf = gdf.to_pandas() + + actual = gdf.groupby("id").cov() + expected = pdf.groupby("id").cov() + + assert_eq( + expected, actual, check_dtype=False, check_index_type=False, + ) + + +def test_groupby_cov_invalid_column_types(): + gdf = cudf.DataFrame( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + }, + ) + with pytest.raises( + TypeError, match="Covariance accepts only numerical column-pairs", + ): + gdf.groupby("id").cov() + + +def test_groupby_cov_positive_semidefinite_matrix(): + # Refer to discussions in PR #9889 re "pair-wise deletion" strategy + # being used in pandas to compute the covariance of a dataframe with + # rows containing missing values. + # Note: cuDF currently matches pandas behavior in that the covariance + # matrices are not guaranteed PSD (positive semi definite). + # https://github.com/rapidsai/cudf/pull/9889#discussion_r794158358 + gdf = cudf.DataFrame( + [[1, 2], [None, 4], [5, None], [7, 8]], columns=["v0", "v1"] + ) + actual = gdf.groupby(by=cudf.Series([1, 1, 1, 1])).cov() + actual.reset_index(drop=True, inplace=True) + + pdf = gdf.to_pandas() + expected = pdf.groupby(by=pd.Series([1, 1, 1, 1])).cov() + expected.reset_index(drop=True, inplace=True) + + assert_eq( + expected, actual, check_dtype=False, + ) + + +@pytest.mark.xfail +def test_groupby_cov_for_pandas_bug_case(): + # Handles case: pandas bug using ddof with missing data. 
+ # Filed an issue in Pandas on GH, link below: + # https://github.com/pandas-dev/pandas/issues/45814 + pdf = pd.DataFrame( + {"id": ["a", "a"], "val1": [1.0, 2.0], "val2": [np.nan, np.nan]} + ) + expected = pdf.groupby("id").cov(ddof=2) + + gdf = cudf.from_pandas(pdf) + actual = gdf.groupby("id").cov(ddof=2) + + assert_eq(expected, actual) + + @pytest.mark.parametrize( "data", [ @@ -8925,3 +9049,24 @@ def test_dataframe_rename_duplicate_column(): ValueError, match="Duplicate column names are not allowed" ): gdf.rename(columns={"a": "b"}, inplace=True) + + +@pytest.mark.parametrize( + "data", + [ + np.random.RandomState(seed=10).randint(-50, 50, (10, 10)), + np.random.RandomState(seed=10).random_sample((4, 4)), + np.array([1.123, 2.343, 5.890, 0.0]), + {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]}, + ], +) +@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5]) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"]) +def test_dataframe_pct_change(data, periods, fill_method): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.pct_change(periods=periods, fill_method=fill_method) + expected = pdf.pct_change(periods=periods, fill_method=fill_method) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py index 47c9448cf63..692f40873d7 100644 --- a/python/cudf/cudf/tests/test_extension_compilation.py +++ b/python/cudf/cudf/tests/test_extension_compilation.py @@ -1,13 +1,17 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import operator import cupy as cp +import numpy as np import pytest from numba import cuda, types from numba.cuda import compile_ptx +from numba.np.numpy_support import from_dtype from cudf import NA from cudf.core.udf.api import Masked from cudf.core.udf.typing import MaskedType +from cudf.testing._utils import parametrize_numeric_dtypes_pairwise arith_ops = ( operator.add, @@ -159,19 +163,21 @@ def func(x): @pytest.mark.parametrize("op", ops) -@pytest.mark.parametrize("ty1", number_types, ids=number_ids) -@pytest.mark.parametrize("ty2", number_types, ids=number_ids) +@parametrize_numeric_dtypes_pairwise @pytest.mark.parametrize( "masked", ((False, True), (True, False), (True, True)), ids=("um", "mu", "mm"), ) -def test_compile_arith_masked_ops(op, ty1, ty2, masked): +def test_compile_arith_masked_ops(op, left_dtype, right_dtype, masked): def func(x, y): return op(x, y) cc = (7, 5) + ty1 = from_dtype(np.dtype(left_dtype)) + ty2 = from_dtype(np.dtype(right_dtype)) + if masked[0]: ty1 = MaskedType(ty1) if masked[1]: diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 102e5b57e8e..19d7c8a10ab 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
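Reviewer note: the new `pct_change` test above defers to pandas for expected values. For reference, the semantics being exercised, shown with pandas alone (runnable without a GPU):

```python
import pandas as pd

s = pd.Series([100.0, None, 110.0, 121.0])

# fill_method="ffill" fills gaps with the prior value before computing
# x[t] / x[t - periods] - 1.
print(s.pct_change(periods=1, fill_method="ffill").tolist())
# [nan, 0.0, ~0.1, ~0.1]
```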
from itertools import combinations @@ -1292,45 +1292,43 @@ def test_loc_datetime_index(sli, is_dataframe): @pytest.mark.parametrize( - "gdf", + "gdf_kwargs", [ - cudf.DataFrame({"a": range(1000000)}), - cudf.DataFrame({"a": range(1000000), "b": range(1000000)}), - cudf.DataFrame({"a": range(20), "b": range(20)}), - cudf.DataFrame( - { + {"data": {"a": range(100000)}}, + {"data": {"a": range(100000), "b": range(100000)}}, + { + "data": { "a": range(20), "b": range(20), "c": ["abc", "def", "xyz", "def", "pqr"] * 4, } - ), - cudf.DataFrame(index=[1, 2, 3]), - cudf.DataFrame(index=range(1000000)), - cudf.DataFrame(columns=["a", "b", "c", "d"]), - cudf.DataFrame(columns=["a"], index=range(1000000)), - cudf.DataFrame( - columns=["a", "col2", "...col n"], index=range(1000000) - ), - cudf.DataFrame(index=cudf.Series(range(1000000)).astype("str")), - cudf.DataFrame( - columns=["a", "b", "c", "d"], - index=cudf.Series(range(1000000)).astype("str"), - ), + }, + {"index": [1, 2, 3]}, + {"index": range(100000)}, + {"columns": ["a", "b", "c", "d"]}, + {"columns": ["a"], "index": range(100000)}, + {"columns": ["a", "col2", "...col n"], "index": range(100000)}, + {"index": cudf.Series(range(100000)).astype("str")}, + { + "columns": ["a", "b", "c", "d"], + "index": cudf.Series(range(100000)).astype("str"), + }, ], ) @pytest.mark.parametrize( "slice", [ - slice(250000, 500000), - slice(250000, 250001), - slice(500000), + slice(25000, 50000), + slice(25000, 25001), + slice(50000), slice(1, 10), slice(10, 20), slice(15, 24000), slice(6), ], ) -def test_dataframe_sliced(gdf, slice): +def test_dataframe_sliced(gdf_kwargs, slice): + gdf = cudf.DataFrame(**gdf_kwargs) pdf = gdf.to_pandas() actual = gdf[slice] diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 8689f773a02..623098741a9 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -16,6 +16,7 @@ import cudf from cudf.io.orc import ORCWriter +from cudf.testing import assert_frame_equal from cudf.testing._utils import ( assert_eq, gen_rand_series, @@ -93,7 +94,7 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine): path, engine=engine, columns=columns, use_index=use_index ) - assert_eq(expect, got, check_categorical=False) + assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False) def test_orc_reader_filenotfound(tmpdir): @@ -384,11 +385,13 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression): else: print(type(excpr).__name__) - expect = orcfile.read(columns=columns).to_pandas() - cudf.from_pandas(expect).to_orc(gdf_fname.strpath, compression=compression) - got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() + expect = cudf.from_pandas(orcfile.read(columns=columns).to_pandas()) + expect.to_orc(gdf_fname.strpath, compression=compression) + got = cudf.from_pandas( + pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() + ) - assert_eq(expect, got) + assert_frame_equal(expect, got) @pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) @@ -405,11 +408,11 @@ def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): else: print(type(excpr).__name__) - expect = orcfile.read().to_pandas() - cudf.from_pandas(expect).to_orc(gdf_fname.strpath, statistics=stats_freq) - got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + expect = cudf.from_pandas(orcfile.read().to_pandas()) + expect.to_orc(gdf_fname.strpath, statistics=stats_freq) + got = 
cudf.from_pandas(pa.orc.ORCFile(gdf_fname).read().to_pandas()) - assert_eq(expect, got) + assert_frame_equal(expect, got) @pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) @@ -492,8 +495,7 @@ def test_chunked_orc_writer( writer.close() got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() - - assert_eq(expect, got) + assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got)) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index e1ca9f6f006..7feaa400446 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1105,9 +1105,9 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir): assert_eq(expect, got) -@pytest.mark.parametrize("skip", range(0, 128)) +@pytest.mark.parametrize("skip", [0, 1, 5, 10]) def test_parquet_reader_list_skiprows(skip, tmpdir): - num_rows = 128 + num_rows = 10 src = pd.DataFrame( { "a": list_gen(int_gen, 0, num_rows, 80, 50), @@ -1124,9 +1124,9 @@ def test_parquet_reader_list_skiprows(skip, tmpdir): assert_eq(expect, got, check_dtype=False) -@pytest.mark.parametrize("skip", range(0, 120)) +@pytest.mark.parametrize("skip", [0, 1, 5, 10]) def test_parquet_reader_list_num_rows(skip, tmpdir): - num_rows = 128 + num_rows = 20 src = pd.DataFrame( { "a": list_gen(int_gen, 0, num_rows, 80, 50), diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index ca02ee55df0..8f2e4811e36 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. import textwrap @@ -13,7 +13,14 @@ from cudf.testing import _utils as utils from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes -repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] +repr_categories = [ + "uint16", + "int64", + "float64", + "str", + "category", + "datetime64[ns]", +] @pytest.mark.parametrize("dtype", repr_categories) @@ -84,36 +91,22 @@ def test_full_series(nrows, dtype): pd.reset_option("display.max_rows") +@pytest.mark.parametrize("nrows", [5, 10, 15]) +@pytest.mark.parametrize("ncols", [5, 10, 15]) +@pytest.mark.parametrize("size", [20, 21]) @pytest.mark.parametrize("dtype", repr_categories) -@pytest.mark.parametrize("nrows", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1]) -@pytest.mark.parametrize("ncols", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1]) -def test_full_dataframe_20(dtype, nrows, ncols): - size = 20 - pdf = pd.DataFrame( - {idx: np.random.randint(0, 100, size) for idx in range(size)} - ).astype(dtype) - gdf = cudf.from_pandas(pdf) - - assert pdf.__repr__() == gdf.__repr__() - assert pdf._repr_html_() == gdf._repr_html_() - assert pdf._repr_latex_() == gdf._repr_latex_() - - -@pytest.mark.parametrize("dtype", repr_categories) -@pytest.mark.parametrize("nrows", [9, 21 / 2, 11, 21 - 1]) -@pytest.mark.parametrize("ncols", [9, 21 / 2, 11, 21 - 1]) -def test_full_dataframe_21(dtype, nrows, ncols): - size = 21 +def test_full_dataframe_20(dtype, size, nrows, ncols): pdf = pd.DataFrame( {idx: np.random.randint(0, 100, size) for idx in range(size)} ).astype(dtype) gdf = cudf.from_pandas(pdf) - pd.options.display.max_rows = int(nrows) - pd.options.display.max_columns = int(ncols) - assert pdf.__repr__() == gdf.__repr__() - pd.reset_option("display.max_rows") - pd.reset_option("display.max_columns") + with pd.option_context( + "display.max_rows", 
int(nrows), "display.max_columns", int(ncols) + ): + assert repr(pdf) == repr(gdf) + assert pdf._repr_html_() == gdf._repr_html_() + assert pdf._repr_latex_() == gdf._repr_latex_() @given( diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index b8f975f233e..2efa781c506 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import re @@ -17,9 +17,9 @@ ) -@pytest.mark.parametrize("num_id_vars", [0, 1, 2, 10]) -@pytest.mark.parametrize("num_value_vars", [0, 1, 2, 10]) -@pytest.mark.parametrize("num_rows", [1, 2, 1000]) +@pytest.mark.parametrize("num_id_vars", [0, 1, 2]) +@pytest.mark.parametrize("num_value_vars", [0, 1, 2]) +@pytest.mark.parametrize("num_rows", [1, 2, 100]) @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) @pytest.mark.parametrize("nulls", ["none", "some", "all"]) def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 653c79fe603..56218372c23 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -532,12 +532,7 @@ def _cat_convert_seq_to_cudf(others): @pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) @pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) @pytest.mark.parametrize( - "index", - [ - ["1", "2", "3", "4", "5"], - pd.Series(["1", "2", "3", "4", "5"]), - pd.Index(["1", "2", "3", "4", "5"]), - ], + "index", [["1", "2", "3", "4", "5"]], ) def test_string_cat(ps_gs, others, sep, na_rep, index): ps, gs = ps_gs @@ -829,7 +824,9 @@ def test_string_join(ps_gs, sep): @pytest.mark.parametrize("pat", [r"(a)", r"(f)", r"([a-z])", r"([A-Z])"]) @pytest.mark.parametrize("expand", [True, False]) -@pytest.mark.parametrize("flags,flags_raise", [(0, 0), (1, 1)]) +@pytest.mark.parametrize( + "flags,flags_raise", [(0, 0), (re.M | re.S, 0), (re.I, 1)] +) def test_string_extract(ps_gs, pat, expand, flags, flags_raise): ps, gs = ps_gs expectation = raise_builder([flags_raise], NotImplementedError) @@ -862,9 +859,7 @@ def test_string_contains(ps_gs, pat, regex, flags, flags_raise, na, na_raise): ps, gs = ps_gs expectation = does_not_raise() - if flags_raise: - expectation = pytest.raises(ValueError) - if na_raise: + if flags_raise or na_raise: expectation = pytest.raises(NotImplementedError) with expectation: diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index dbff626c363..167f171fa26 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -205,6 +205,14 @@ def test_dataframe_to_struct(): df["a"][0] = 5 assert_eq(got, expect) + # check that a non-string (but convertible to string) named column can be + # converted to struct + df = cudf.DataFrame([[1, 2], [3, 4]], columns=[(1, "b"), 0]) + expect = cudf.Series([{"(1, 'b')": 1, "0": 2}, {"(1, 'b')": 3, "0": 4}]) + with pytest.warns(UserWarning, match="will be casted"): + got = df.to_struct() + assert_eq(got, expect) + @pytest.mark.parametrize( "series, slce", diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 56090c8eacf..faaea6eec08 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -1,3 +1,4 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
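Reviewer note: replacing manual `pd.options` set/reset pairs with `pd.option_context`, as in the repr tests above, makes restoration exception-safe. A minimal illustration:

```python
import pandas as pd

before = pd.get_option("display.max_rows")
with pd.option_context("display.max_rows", 5):
    # The option applies only inside the block, even if an assertion
    # in the block fails.
    assert pd.get_option("display.max_rows") == 5
# Restored on exit, including exceptional exits.
assert pd.get_option("display.max_rows") == before
```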
import math import operator @@ -14,7 +15,11 @@ unary_ops, ) from cudf.core.udf.utils import precompiled -from cudf.testing._utils import NUMERIC_TYPES, _decimal_series, assert_eq +from cudf.testing._utils import ( + _decimal_series, + assert_eq, + parametrize_numeric_dtypes_pairwise, +) def run_masked_udf_test(func, data, args=(), **kwargs): @@ -238,10 +243,9 @@ def func(row): run_masked_udf_test(func, gdf, check_dtype=False) -@pytest.mark.parametrize("dtype_a", list(NUMERIC_TYPES)) -@pytest.mark.parametrize("dtype_b", list(NUMERIC_TYPES)) +@parametrize_numeric_dtypes_pairwise @pytest.mark.parametrize("op", [operator.add, operator.and_, operator.eq]) -def test_apply_mixed_dtypes(dtype_a, dtype_b, op): +def test_apply_mixed_dtypes(left_dtype, right_dtype, op): """ Test that operations can be performed between columns of different dtypes and return a column with the correct @@ -251,7 +255,7 @@ def test_apply_mixed_dtypes(dtype_a, dtype_b, op): # First perform the op on two dummy data on host, if numpy can # safely type cast, we should expect it to work in udf too. try: - op(getattr(np, dtype_a)(0), getattr(np, dtype_b)(42)) + op(np.dtype(left_dtype).type(0), np.dtype(right_dtype).type(42)) except TypeError: pytest.skip("Operation is unsupported for corresponding dtype.") @@ -261,8 +265,8 @@ def func(row): return op(x, y) gdf = cudf.DataFrame({"a": [1.5, None, 3, None], "b": [4, 5, None, None]}) - gdf["a"] = gdf["a"].astype(dtype_a) - gdf["b"] = gdf["b"].astype(dtype_b) + gdf["a"] = gdf["a"].astype(left_dtype) + gdf["b"] = gdf["b"].astype(right_dtype) run_masked_udf_test(func, gdf, check_dtype=False) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 191e6591a74..4dadfede866 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -2,6 +2,8 @@ import decimal import functools +import os +import traceback from collections.abc import Sequence from typing import FrozenSet, Set, Union @@ -37,6 +39,60 @@ } +# The test root is set by pytest to support situations where tests are run from +# a source tree on a built version of cudf. +NO_EXTERNAL_ONLY_APIS = os.getenv("NO_EXTERNAL_ONLY_APIS") + +_cudf_root = os.path.dirname(cudf.__file__) +# If the environment variable for the test root is not set, we default to +# using the path relative to the cudf root directory. +_tests_root = os.getenv("_CUDF_TEST_ROOT") or os.path.join(_cudf_root, "tests") + + +def _external_only_api(func, alternative=""): + """Decorator to indicate that a function should not be used internally. + + cudf contains many APIs that exist for pandas compatibility but are + intrinsically inefficient. For some of these cudf has internal + equivalents that are much faster. Usage of the slow public APIs inside + our implementation can lead to unnecessary performance bottlenecks. + Applying this decorator to such functions and setting the environment + variable NO_EXTERNAL_ONLY_APIS will cause such functions to raise + exceptions if they are called from anywhere inside cudf, making it easy + to identify and excise such usage. + + The `alternative` should be a complete phrase or sentence since it will + be used verbatim in error messages. + """ + + # If the first arg is a string then an alternative function to use in + # place of this API was provided, so we pass that to a subsequent call. 
+ # It would be cleaner to implement this pattern by using a class + # decorator with a factory method, but there is no way to generically + # wrap docstrings on a class (we would need the docstring to be on the + # class itself, not instances, because that's what `help` looks at) and + # there is also no way to make mypy happy with that approach. + if isinstance(func, str): + return lambda actual_func: _external_only_api(actual_func, func) + + if not NO_EXTERNAL_ONLY_APIS: + return func + + @functools.wraps(func) + def wrapper(*args, **kwargs): + # Check the immediately preceding frame to see if it's in cudf. + frame, lineno = next(traceback.walk_stack(None)) + fn = frame.f_code.co_filename + if _cudf_root in fn and _tests_root not in fn: + raise RuntimeError( + f"External-only API called in {fn} at line {lineno}. " + f"{alternative}" + ) + return func(*args, **kwargs) + + return wrapper + + # TODO: We should evaluate whether calls to this could be more easily replaced # with column.full, which appears to be significantly faster in simple cases. def scalar_broadcast_to(scalar, size, dtype=None): @@ -69,15 +125,7 @@ def scalar_broadcast_to(scalar, size, dtype=None): scalar = to_cudf_compatible_scalar(scalar, dtype=dtype) dtype = scalar.dtype - if cudf.dtype(dtype).kind in ("O", "U"): - gather_map = column.full(size, 0, dtype="int32") - scalar_str_col = column.as_column([scalar], dtype="str") - return scalar_str_col[gather_map] - else: - out_col = column.column_empty(size, dtype=dtype) - if out_col.size != 0: - out_col.data_array_view[:] = scalar - return out_col + return cudf.core.column.full(size=size, fill_value=scalar, dtype=dtype) def initfunc(f): @@ -146,28 +194,6 @@ def set_allocator( IS_NEP18_ACTIVE = _is_nep18_active() -class cached_property: - """ - Like @property, but only evaluated upon first invocation. - To force re-evaluation of a cached_property, simply delete - it with `del`. - """ - - # TODO: Can be replaced with functools.cached_property when we drop support - # for Python 3.7. - - def __init__(self, func): - self.func = func - - def __get__(self, instance, cls): - if instance is None: - return self - else: - value = self.func(instance) - object.__setattr__(instance, self.func.__name__, value) - return value - - class GetAttrGetItemMixin: """This mixin changes `__getattr__` to attempt a `__getitem__` call. @@ -363,7 +389,10 @@ def get_appropriate_dispatched_func( cupy_compatible_args, index = _get_cupy_compatible_args_index(args) if cupy_compatible_args: cupy_output = cupy_func(*cupy_compatible_args, **kwargs) - return _cast_to_appropriate_cudf_type(cupy_output, index) + if isinstance(cupy_output, cp.ndarray): + return _cast_to_appropriate_cudf_type(cupy_output, index) + else: + return cupy_output return NotImplemented diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 89b5301ee83..1b1f3e29ab2 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
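Reviewer note: the decorator above gates on the caller's filename, taken from the first frame yielded by `traceback.walk_stack(None)`. A self-contained sketch of that technique with hypothetical paths; the real decorator additionally threads through the `alternative` message and the environment-variable opt-in:

```python
import functools
import traceback

_PACKAGE_ROOT = "/path/to/mylib"  # hypothetical package root


def external_only(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # The first frame walk_stack(None) yields here is wrapper's caller.
        frame, lineno = next(traceback.walk_stack(None))
        filename = frame.f_code.co_filename
        if filename.startswith(_PACKAGE_ROOT):
            raise RuntimeError(
                f"external-only API called from {filename}:{lineno}"
            )
        return func(*args, **kwargs)

    return wrapper
```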
from collections.abc import Iterator @@ -6,6 +6,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from nvtx import annotate from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( @@ -39,6 +40,7 @@ @meta_nonempty.register(cudf.BaseIndex) +@annotate("_nonempty_index", color="green", domain="dask_cudf_python") def _nonempty_index(idx): if isinstance(idx, cudf.core.index.RangeIndex): return cudf.core.index.RangeIndex(2, name=idx.name) @@ -73,6 +75,7 @@ def _nonempty_index(idx): raise TypeError(f"Don't know how to handle index of type {type(idx)}") +@annotate("_get_non_empty_data", color="green", domain="dask_cudf_python") def _get_non_empty_data(s): if isinstance(s._column, cudf.core.column.CategoricalColumn): categories = ( @@ -100,6 +103,7 @@ def _get_non_empty_data(s): @meta_nonempty.register(cudf.Series) +@annotate("_nonempty_series", color="green", domain="dask_cudf_python") def _nonempty_series(s, idx=None): if idx is None: idx = _nonempty_index(s.index) @@ -109,6 +113,7 @@ def _nonempty_series(s, idx=None): @meta_nonempty.register(cudf.DataFrame) +@annotate("meta_nonempty_cudf", color="green", domain="dask_cudf_python") def meta_nonempty_cudf(x): idx = meta_nonempty(x.index) columns_with_dtype = dict() @@ -124,15 +129,18 @@ def meta_nonempty_cudf(x): @make_meta_dispatch.register((cudf.Series, cudf.DataFrame)) +@annotate("make_meta_cudf", color="green", domain="dask_cudf_python") def make_meta_cudf(x, index=None): return x.head(0) @make_meta_dispatch.register(cudf.BaseIndex) +@annotate("make_meta_cudf_index", color="green", domain="dask_cudf_python") def make_meta_cudf_index(x, index=None): return x[:0] +@annotate("_empty_series", color="green", domain="dask_cudf_python") def _empty_series(name, dtype, index=None): if isinstance(dtype, str) and dtype == "category": return cudf.Series( @@ -142,6 +150,7 @@ def _empty_series(name, dtype, index=None): @make_meta_obj.register(object) +@annotate("make_meta_object_cudf", color="green", domain="dask_cudf_python") def make_meta_object_cudf(x, index=None): """Create an empty cudf object containing the desired metadata. 
@@ -212,6 +221,7 @@ def make_meta_object_cudf(x, index=None): @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.BaseIndex)) +@annotate("concat_cudf", color="green", domain="dask_cudf_python") def concat_cudf( dfs, axis=0, @@ -236,11 +246,13 @@ def concat_cudf( @categorical_dtype_dispatch.register( (cudf.DataFrame, cudf.Series, cudf.BaseIndex) ) +@annotate("categorical_dtype_cudf", color="green", domain="dask_cudf_python") def categorical_dtype_cudf(categories=None, ordered=None): return cudf.CategoricalDtype(categories=categories, ordered=ordered) @tolist_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("tolist_cudf", color="green", domain="dask_cudf_python") def tolist_cudf(obj): return obj.to_arrow().to_pylist() @@ -248,6 +260,9 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) ) +@annotate( + "is_categorical_dtype_cudf", color="green", domain="dask_cudf_python" +) def is_categorical_dtype_cudf(obj): return cudf.api.types.is_categorical_dtype(obj) @@ -261,6 +276,7 @@ def is_categorical_dtype_cudf(obj): ) @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) + @annotate("percentile_cudf", color="green", domain="dask_cudf_python") def percentile_cudf(a, q, interpolation="linear"): # Cudf dispatch to the equivalent of `np.percentile`: # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html @@ -305,6 +321,7 @@ def percentile_cudf(a, q, interpolation="linear"): @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("union_categoricals_cudf", color="green", domain="dask_cudf_python") def union_categoricals_cudf( to_union, sort_categories=False, ignore_order=False ): @@ -313,11 +330,13 @@ def union_categoricals_cudf( ) +@annotate("safe_hash", color="green", domain="dask_cudf_python") def safe_hash(frame): return cudf.Series(frame.hash_values(), index=frame.index) @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) +@annotate("hash_object_cudf", color="green", domain="dask_cudf_python") def hash_object_cudf(frame, index=True): if index: return safe_hash(frame.reset_index()) @@ -325,6 +344,7 @@ def hash_object_cudf(frame, index=True): @hash_object_dispatch.register(cudf.BaseIndex) +@annotate("hash_object_cudf_index", color="green", domain="dask_cudf_python") def hash_object_cudf_index(ind, index=None): if isinstance(ind, cudf.MultiIndex): @@ -335,6 +355,7 @@ def hash_object_cudf_index(ind, index=None): @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) +@annotate("group_split_cudf", color="green", domain="dask_cudf_python") def group_split_cudf(df, c, k, ignore_index=False): return dict( zip( @@ -349,10 +370,12 @@ def group_split_cudf(df, c, k, ignore_index=False): @sizeof_dispatch.register(cudf.DataFrame) +@annotate("sizeof_cudf_dataframe", color="green", domain="dask_cudf_python") def sizeof_cudf_dataframe(df): return int(df.memory_usage().sum()) @sizeof_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("sizeof_cudf_series_index", color="green", domain="dask_cudf_python") def sizeof_cudf_series_index(obj): return obj.memory_usage() diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 729db6c232d..d8802f33941 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
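Reviewer note: `nvtx.annotate` is applied throughout these files as a decorator; it also works as a context manager, which can be handy when only part of a function is worth ranging. A small sketch; the names are illustrative, and the ranges are inert unless a profiler such as Nsight Systems is attached:

```python
from nvtx import annotate


@annotate("load_data", color="green", domain="example")
def load_data():
    return list(range(1000))


def process(data):
    # Range just the hot loop instead of the whole function.
    with annotate("hot_loop", color="blue", domain="example"):
        return [x * 2 for x in data]
```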
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index 729db6c232d..d8802f33941 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.

 import math
 import warnings
@@ -6,6 +6,7 @@
 import numpy as np
 import pandas as pd
+from nvtx import annotate
 from tlz import partition_all

 import dask
@@ -57,6 +58,7 @@ def __dask_postcompute__(self):
     def __dask_postpersist__(self):
         return type(self), (self._name, self._meta, self.divisions)

+    @annotate("_FRAME_INIT", color="green", domain="dask_cudf_python")
     def __init__(self, dsk, name, meta, divisions):
         if not isinstance(dsk, HighLevelGraph):
             dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[])
@@ -82,6 +84,9 @@ def __repr__(self):
         s = "<dask_cudf.%s | %d tasks | %d npartitions>"
         return s % (type(self).__name__, len(self.dask), self.npartitions)

+    @annotate(
+        "_FRAME_to_dask_dataframe", color="green", domain="dask_cudf_python"
+    )
     def to_dask_dataframe(self, **kwargs):
         """Create a dask.dataframe object from a dask_cudf object"""
         nullable_pd_dtype = kwargs.get("nullable_pd_dtype", False)
@@ -99,6 +104,9 @@ def to_dask_dataframe(self, **kwargs):
 class DataFrame(_Frame, dd.core.DataFrame):
     _partition_type = cudf.DataFrame

+    @annotate(
+        "DATAFRAME_assign_column", color="green", domain="dask_cudf_python"
+    )
     def _assign_column(self, k, v):
         def assigner(df, k, v):
             out = df.copy()
@@ -108,6 +116,7 @@ def assigner(df, k, v):
         meta = assigner(self._meta, k, dask_make_meta(v))
         return self.map_partitions(assigner, k, v, meta=meta)

+    @annotate("DATAFRAME_apply_rows", color="green", domain="dask_cudf_python")
     def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None):
         import uuid
@@ -127,6 +136,7 @@ def do_apply_rows(df, func, incols, outcols, kwargs):
             do_apply_rows, func, incols, outcols, kwargs, meta=meta
         )

+    @annotate("DATAFRAME_merge", color="green", domain="dask_cudf_python")
     def merge(self, other, **kwargs):
         if kwargs.pop("shuffle", "tasks") != "tasks":
             raise ValueError(
@@ -138,6 +148,7 @@ def merge(self, other, **kwargs):
             on = list(on)
         return super().merge(other, on=on, shuffle="tasks", **kwargs)

+    @annotate("DATAFRAME_join", color="green", domain="dask_cudf_python")
     def join(self, other, **kwargs):
         if kwargs.pop("shuffle", "tasks") != "tasks":
             raise ValueError(
@@ -155,6 +166,7 @@ def join(self, other, **kwargs):
             on = list(on)
         return super().join(other, how=how, on=on, shuffle="tasks", **kwargs)

+    @annotate("DATAFRAME_set_index", color="green", domain="dask_cudf_python")
     def set_index(self, other, sorted=False, divisions=None, **kwargs):
         if kwargs.pop("shuffle", "tasks") != "tasks":
             raise ValueError(
@@ -226,6 +238,9 @@ def set_index(self, other, sorted=False, divisions=None, **kwargs):
             **kwargs,
         )

+    @annotate(
+        "DATAFRAME_sort_values", color="green", domain="dask_cudf_python"
+    )
     def sort_values(
         self,
         by,
@@ -261,12 +276,14 @@ def sort_values(
             return df.reset_index(drop=True)
         return df

+    @annotate("DATAFRAME_to_parquet", color="green", domain="dask_cudf_python")
     def to_parquet(self, path, *args, **kwargs):
         """Calls dask.dataframe.io.to_parquet with CudfEngine backend"""
         from dask_cudf.io import to_parquet

         return to_parquet(self, path, *args, **kwargs)

+    @annotate("DATAFRAME_to_orc", color="green", domain="dask_cudf_python")
     def to_orc(self, path, **kwargs):
         """Calls dask_cudf.io.to_orc"""
         from dask_cudf.io import to_orc
@@ -274,6 +291,7 @@ def to_orc(self, path, **kwargs):
         return to_orc(self, path, **kwargs)

     @derived_from(pd.DataFrame)
+    @annotate("DATAFRAME_var", color="green", domain="dask_cudf_python")
     def var(
         self,
         axis=None,
@@ -302,6 +320,9 @@ def var(
         else:
             return _parallel_var(self, meta, skipna, split_every, out)

+    @annotate(
+        "DATAFRAME_repartition", color="green", domain="dask_cudf_python"
domain="dask_cudf_python" + ) def repartition(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.repartition method. Uses DataFrame.shuffle if `columns=` is specified. @@ -324,6 +345,7 @@ def repartition(self, *args, **kwargs): ) return super().repartition(*args, **kwargs) + @annotate("DATAFRAME_shuffle", color="green", domain="dask_cudf_python") def shuffle(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.shuffle method""" shuffle_arg = kwargs.pop("shuffle", None) @@ -331,18 +353,21 @@ def shuffle(self, *args, **kwargs): raise ValueError("dask_cudf does not support disk-based shuffle.") return super().shuffle(*args, shuffle="tasks", **kwargs) + @annotate("DATAFRAME_groupby", color="green", domain="dask_cudf_python") def groupby(self, by=None, **kwargs): from .groupby import CudfDataFrameGroupBy return CudfDataFrameGroupBy(self, by=by, **kwargs) +@annotate("DATAFRAME_sum_of_squares", color="green", domain="dask_cudf_python") def sum_of_squares(x): x = x.astype("f8")._column outcol = libcudf.reduce.reduce("sum_of_squares", x) return cudf.Series(outcol) +@annotate("DATAFRAME_var_aggregate", color="green", domain="dask_cudf_python") def var_aggregate(x2, x, n, ddof): try: with warnings.catch_warnings(record=True): @@ -355,10 +380,12 @@ def var_aggregate(x2, x, n, ddof): return np.float64(np.nan) +@annotate("DATAFRAME_nlargest_agg", color="green", domain="dask_cudf_python") def nlargest_agg(x, **kwargs): return cudf.concat(x).nlargest(**kwargs) +@annotate("DATAFRAME_nsmallest_agg", color="green", domain="dask_cudf_python") def nsmallest_agg(x, **kwargs): return cudf.concat(x).nsmallest(**kwargs) @@ -366,6 +393,7 @@ def nsmallest_agg(x, **kwargs): class Series(_Frame, dd.core.Series): _partition_type = cudf.Series + @annotate("Series_count", color="green", domain="dask_cudf_python") def count(self, split_every=False): return reduction( [self], @@ -375,12 +403,14 @@ def count(self, split_every=False): meta="i8", ) + @annotate("Series_mean", color="green", domain="dask_cudf_python") def mean(self, split_every=False): sum = self.sum(split_every=split_every) n = self.count(split_every=split_every) return sum / n @derived_from(pd.DataFrame) + @annotate("Series_var", color="green", domain="dask_cudf_python") def var( self, axis=None, @@ -409,16 +439,19 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) + @annotate("Series_groupby", color="green", domain="dask_cudf_python") def groupby(self, *args, **kwargs): from .groupby import CudfSeriesGroupBy return CudfSeriesGroupBy(self, *args, **kwargs) @property + @annotate("Series_list", color="green", domain="dask_cudf_python") def list(self): return ListMethods(self) @property + @annotate("Series_struct", color="green", domain="dask_cudf_python") def struct(self): return StructMethods(self) @@ -427,6 +460,7 @@ class Index(Series, dd.core.Index): _partition_type = cudf.Index # type: ignore +@annotate("_naive_var", color="green", domain="dask_cudf_python") def _naive_var(ddf, meta, skipna, ddof, split_every, out): num = ddf._get_numeric_data() x = 1.0 * num.sum(skipna=skipna, split_every=split_every) @@ -441,6 +475,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): return handle_out(out, result) +@annotate("_parallel_var", color="green", domain="dask_cudf_python") def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): if skipna: @@ -507,6 +542,7 @@ def _finalize_var(vals): return handle_out(out, result) +@annotate("_extract_meta", color="green", domain="dask_cudf_python") def 
 def _extract_meta(x):
     """
     Extract internal cache data (``_meta``) from dask_cudf objects
@@ -522,6 +558,7 @@ def _extract_meta(x):
     return x

+@annotate("_emulate", color="green", domain="dask_cudf_python")
 def _emulate(func, *args, **kwargs):
     """
     Apply a function using args / kwargs. If arguments contain dd.DataFrame /
@@ -531,6 +568,7 @@ def _emulate(func, *args, **kwargs):
     return func(*_extract_meta(args), **_extract_meta(kwargs))

+@annotate("align_partitions", color="green", domain="dask_cudf_python")
 def align_partitions(args):
     """Align partitions between dask_cudf objects.
@@ -546,6 +584,7 @@ def align_partitions(args):
     return args

+@annotate("reduction", color="green", domain="dask_cudf_python")
 def reduction(
     args,
     chunk=None,
@@ -684,6 +723,7 @@ def reduction(
     return dd.core.new_dd_object(graph, b, meta, (None, None))

+@annotate("from_cudf", color="green", domain="dask_cudf_python")
 def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
     if isinstance(getattr(data, "index", None), cudf.MultiIndex):
         raise NotImplementedError(
@@ -705,6 +745,7 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
     )

+@annotate("from_dask_dataframe", color="green", domain="dask_cudf_python")
 def from_dask_dataframe(df):
     return df.map_partitions(cudf.from_pandas)
diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py
index 1bc270a5b9f..658e63ea923 100644
--- a/python/dask_cudf/dask_cudf/groupby.py
+++ b/python/dask_cudf/dask_cudf/groupby.py
@@ -6,6 +6,7 @@
 import numpy as np
 import pandas as pd
+from nvtx import annotate

 from dask.base import tokenize
 from dask.dataframe.core import (
@@ -35,11 +36,19 @@
 class CudfDataFrameGroupBy(DataFrameGroupBy):
+    @annotate(
+        "CudfDataFrameGroupBy_INIT", color="green", domain="dask_cudf_python"
+    )
     def __init__(self, *args, **kwargs):
         self.sep = kwargs.pop("sep", "___")
         self.as_index = kwargs.pop("as_index", True)
         super().__init__(*args, **kwargs)

+    @annotate(
+        "CudfDataFrameGroupBy_GETITEM",
+        color="green",
+        domain="dask_cudf_python",
+    )
     def __getitem__(self, key):
         if isinstance(key, list):
             g = CudfDataFrameGroupBy(
@@ -53,6 +62,9 @@ def __getitem__(self, key):
             g._meta = g._meta[key]
         return g

+    @annotate(
+        "CudfDataFrameGroupBy_MEAN", color="green", domain="dask_cudf_python"
+    )
     def mean(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -66,6 +78,11 @@ def mean(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )

+    @annotate(
+        "CudfDataFrameGroupBy_COLLECT",
+        color="green",
+        domain="dask_cudf_python",
+    )
     def collect(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -79,6 +96,11 @@ def collect(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )

+    @annotate(
+        "CudfDataFrameGroupBy_AGGREGATE",
+        color="green",
+        domain="dask_cudf_python",
+    )
     def aggregate(self, arg, split_every=None, split_out=1):
         if arg == "size":
             return self.size()
@@ -118,11 +140,17 @@ def aggregate(self, arg, split_every=None, split_out=1):
 class CudfSeriesGroupBy(SeriesGroupBy):
+    @annotate(
+        "CudfSeriesGroupBy_INIT", color="green", domain="dask_cudf_python"
+    )
     def __init__(self, *args, **kwargs):
         self.sep = kwargs.pop("sep", "___")
         self.as_index = kwargs.pop("as_index", True)
         super().__init__(*args, **kwargs)

+    @annotate(
+        "CudfSeriesGroupBy_MEAN", color="green", domain="dask_cudf_python"
+    )
     def mean(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -136,6 +164,9 @@ def mean(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )[self._slice]

+    @annotate(
+        "CudfSeriesGroupBy_STD", color="green", domain="dask_cudf_python"
+    )
     def std(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -149,6 +180,9 @@ def std(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )[self._slice]

+    @annotate(
+        "CudfSeriesGroupBy_VAR", color="green", domain="dask_cudf_python"
+    )
     def var(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -162,6 +196,9 @@ def var(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )[self._slice]

+    @annotate(
+        "CudfSeriesGroupBy_COLLECT", color="green", domain="dask_cudf_python"
+    )
     def collect(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -175,6 +212,9 @@ def collect(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )[self._slice]

+    @annotate(
+        "CudfSeriesGroupBy_AGGREGATE", color="green", domain="dask_cudf_python"
+    )
     def aggregate(self, arg, split_every=None, split_out=1):
         if arg == "size":
             return self.size()
@@ -205,6 +245,7 @@ def aggregate(self, arg, split_every=None, split_out=1):
     )

+@annotate("groupby_agg", color="green", domain="dask_cudf_python")
 def groupby_agg(
     ddf,
     gb_cols,
@@ -371,6 +412,7 @@ def groupby_agg(
     return new_dd_object(graph, gb_agg_name, _meta, divisions)

+@annotate("_redirect_aggs", color="green", domain="dask_cudf_python")
 def _redirect_aggs(arg):
     """Redirect aggregations to their corresponding name in cuDF"""
     redirects = {
@@ -397,6 +439,7 @@ def _redirect_aggs(arg):
     return redirects.get(arg, arg)

+@annotate("_is_supported", color="green", domain="dask_cudf_python")
 def _is_supported(arg, supported: set):
     """Check that aggregations in `arg` are a subset of `supported`"""
     if isinstance(arg, (list, dict)):
@@ -422,6 +465,7 @@ def _make_name(*args, sep="_"):
     return sep.join(_args)

+@annotate("_groupby_partition_agg", color="green", domain="dask_cudf_python")
 def _groupby_partition_agg(
     df, gb_cols, aggs, columns, split_out, dropna, sort, sep
 ):
@@ -479,6 +523,7 @@ def _groupby_partition_agg(
     return output

+@annotate("_tree_node_agg", color="green", domain="dask_cudf_python")
 def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep):
     """Node in groupby-aggregation reduction tree.
@@ -513,6 +558,7 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep):
     return gb

+@annotate("_var_agg", color="green", domain="dask_cudf_python")
 def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
     """Calculate variance (given count, sum, and sum-squared columns)."""
@@ -534,6 +580,7 @@ def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
     return var

+@annotate("_finalize_gb_agg", color="green", domain="dask_cudf_python")
 def _finalize_gb_agg(
     gb,
     gb_cols,
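The groupby helpers annotated above form a chunk/combine/finalize tree: `_groupby_partition_agg` computes per-partition partial aggregates, `_tree_node_agg` re-aggregates concatenated partials at each level of the reduction tree, and `_finalize_gb_agg` (with `_var_agg`) derives the requested statistics from count, sum, and sum-of-squares columns. A hedged sketch of that decomposition follows; the column names and helper functions are illustrative, not dask_cudf internals:

# Illustrative chunk/combine/finalize groupby, mirroring the
# count/sum/pow2-sum decomposition that _var_agg relies on. Not library code.
import cudf

def chunk(df):
    # Per-partition partial aggregates over column "x" grouped by "key".
    part = df.assign(x2=df["x"] ** 2)
    gb = part.groupby("key").agg({"x": ["count", "sum"], "x2": "sum"})
    gb.columns = ["count", "sum", "pow2_sum"]
    return gb.reset_index()

def combine(parts):
    # Partial aggregates are themselves summable, so any number of
    # partitions can be merged at each level of the tree.
    return cudf.concat(parts).groupby("key").sum().reset_index()

def finalize(df, ddof=1):
    # Mean and variance recovered from the three running totals.
    mean = df["sum"] / df["count"]
    var = (df["pow2_sum"] - df["count"] * mean**2) / (df["count"] - ddof)
    return df.assign(mean=mean, var=var)[["key", "mean", "var"]]

Because the partials are summable, the combine step can be applied repeatedly with any fan-in, which is what makes the `split_every` tree reduction in `groupby_agg` possible.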
diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py
index af40d9ca41b..ada738c5a9b 100755
--- a/python/dask_cudf/dask_cudf/sorting.py
+++ b/python/dask_cudf/dask_cudf/sorting.py
@@ -1,9 +1,11 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+
 from collections.abc import Iterator

 import cupy
 import numpy as np
 import tlz as toolz
+from nvtx import annotate

 from dask.base import tokenize
 from dask.dataframe import methods
@@ -16,12 +18,14 @@
 from cudf.api.types import is_categorical_dtype

+@annotate("set_index_post", color="green", domain="dask_cudf_python")
 def set_index_post(df, index_name, drop, column_dtype):
     df2 = df.set_index(index_name, drop=drop)
     df2.columns = df2.columns.astype(column_dtype)
     return df2

+@annotate("_set_partitions_pre", color="green", domain="dask_cudf_python")
 def _set_partitions_pre(s, divisions, ascending=True, na_position="last"):
     if ascending:
         partitions = divisions.searchsorted(s, side="right") - 1
@@ -38,6 +42,7 @@ def _set_partitions_pre(s, divisions, ascending=True, na_position="last"):
     return partitions

+@annotate("_quantile", color="green", domain="dask_cudf_python")
 def _quantile(a, q):
     n = len(a)
     if not len(a):
@@ -45,6 +50,7 @@ def _quantile(a, q):
     return (a.quantiles(q=q.tolist(), interpolation="nearest"), n)

+@annotate("merge_quantiles", color="green", domain="dask_cudf_python")
 def merge_quantiles(finalq, qs, vals):
     """Combine several quantile calculations of different data.
     [NOTE: Same logic as dask.array merge_percentiles]
@@ -107,6 +113,7 @@ def _append_counts(val, count):
     return rv.reset_index(drop=True)

+@annotate("_approximate_quantile", color="green", domain="dask_cudf_python")
 def _approximate_quantile(df, q):
     """Approximate quantiles of DataFrame or Series.
     [NOTE: Same logic as dask.dataframe Series quantile]
@@ -180,6 +187,7 @@ def set_quantile_index(df):
     return df

+@annotate("quantile_divisions", color="green", domain="dask_cudf_python")
 def quantile_divisions(df, by, npartitions):
     qn = np.linspace(0.0, 1.0, npartitions + 1).tolist()
     divisions = _approximate_quantile(df[by], qn).compute()
@@ -213,6 +221,7 @@ def quantile_divisions(df, by, npartitions):
     return divisions

+@annotate("sort_values", color="green", domain="dask_cudf_python")
 def sort_values(
     df,
     by,
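`sort_values` ties the annotated helpers above together: `quantile_divisions` picks partition boundaries from approximate quantiles of the sort key, and `_set_partitions_pre` maps every row to an output partition with `searchsorted` before the shuffle. A standalone sketch of that mapping, with made-up values:

# Standalone sketch of the searchsorted partition assignment performed by
# _set_partitions_pre (ascending case). The data here is made up.
import cudf

divisions = cudf.Series([0, 10, 20, 30])  # boundaries for 3 output partitions
s = cudf.Series([7, 25, 10, 29, 0])       # sort-key values from one partition

partitions = divisions.searchsorted(s, side="right") - 1
# Keys at or beyond the last boundary belong in the final partition.
partitions[(s >= divisions.iloc[-1]).values] = len(divisions) - 2

Each row is then shuffled to the partition index computed here, and a partition-local sort leaves the whole frame globally sorted.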