diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1e1ad94ab0b..9e72c0119f3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -88,6 +88,13 @@ repos:
               # of dependencies, so we'll have to update this manually.
               additional_dependencies:
                 - cmake-format==0.6.11
+      - id: copyright-check
+        name: copyright-check
+        # This hook's use of Git tools appears to conflict with
+        # existing CI invocations so we don't invoke it during CI runs.
+        stages: [commit]
+        entry: python ./ci/checks/copyright.py --git-modified-only
+        language: python
 
 default_language_version:
     python: python3
diff --git a/build.sh b/build.sh
index 8b3add1dddd..765a1b5325f 100755
--- a/build.sh
+++ b/build.sh
@@ -18,7 +18,7 @@ ARGS=$*
 REPODIR=$(cd $(dirname $0); pwd)
 
 VALIDARGS="clean libcudf cudf dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n -l --allgpuarch --disable_nvtx --show_depr_warn --ptds -h --build_metrics --incl_cache_stats"
-HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [-l] [--cmake-args=\\\"\\\"]
+HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"]
    clean            - remove all existing build artifacts and configuration (start over)
    libcudf          - build the cudf C++ code only
@@ -32,7 +32,6 @@ HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafk
    -v               - verbose build mode
    -g               - build for debug
    -n               - no install step
-   -l               - build legacy tests
    --allgpuarch     - build for all supported GPU architectures
    --disable_nvtx   - disable inserting NVTX profiling ranges
    --show_depr_warn - show cmake deprecation warnings
diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh
index 178bdab0154..62eeb4d131b 100755
--- a/ci/benchmark/build.sh
+++ b/ci/benchmark/build.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 #########################################
 # cuDF GPU build and test script for CI #
 #########################################
@@ -98,11 +98,7 @@ conda list --show-channel-urls
 ################################################################################
 
 logger "Build libcudf..."
-if [[ "${BUILD_MODE}" == "pull-request" ]]; then
-    "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests --ptds
-else
-    "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests -l --ptds
-fi
+"$WORKSPACE/build.sh" clean libcudf cudf dask_cudf benchmarks tests --ptds
 
 ################################################################################
 # BENCHMARK - Run and parse libcudf and cuDF benchmarks
diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py
new file mode 100644
index 00000000000..d72fd95fea3
--- /dev/null
+++ b/ci/checks/copyright.py
@@ -0,0 +1,233 @@
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import datetime
+import re
+import argparse
+import io
+import os
+import sys
+
+SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.expanduser(__file__)))
+
+# Add the scripts dir for gitutils
+sys.path.append(os.path.normpath(SCRIPT_DIR))
+
+# Now import gitutils. Ignore flake8 error here since there is no other way to
+# set up imports
+import gitutils  # noqa: E402
+
+FilesToCheck = [
+    re.compile(r"[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$"),
+    re.compile(r"CMakeLists[.]txt$"),
+    re.compile(r"CMakeLists_standalone[.]txt$"),
+    re.compile(r"setup[.]cfg$"),
+    re.compile(r"[.]flake8[.]cython$"),
+    re.compile(r"meta[.]yaml$")
+]
+ExemptFiles = []
+
+# this will break starting at year 10000, which is probably OK :)
+CheckSimple = re.compile(
+    r"Copyright *(?:\(c\))? *(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)")
+CheckDouble = re.compile(
+    r"Copyright *(?:\(c\))? *(\d{4})-(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)"  # noqa: E501
+)
+
+
+def checkThisFile(f):
+    # This check covers things like symlinks that point to nonexistent files
+    if not (os.path.exists(f)):
+        return False
+    if gitutils and gitutils.isFileEmpty(f):
+        return False
+    for exempt in ExemptFiles:
+        if exempt.search(f):
+            return False
+    for checker in FilesToCheck:
+        if checker.search(f):
+            return True
+    return False
+
+
+def getCopyrightYears(line):
+    res = CheckSimple.search(line)
+    if res:
+        return (int(res.group(1)), int(res.group(1)))
+    res = CheckDouble.search(line)
+    if res:
+        return (int(res.group(1)), int(res.group(2)))
+    return (None, None)
+
+
+def replaceCurrentYear(line, start, end):
+    # first turn a simple regex into double (if applicable). then update years
+    res = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line)
+    res = CheckDouble.sub(
+        r"Copyright (c) {:04d}-{:04d}, NVIDIA CORPORATION".format(start, end),
+        res)
+    return res
+
+
+def checkCopyright(f, update_current_year):
+    """
+    Checks for copyright headers and their years
+    """
+    errs = []
+    thisYear = datetime.datetime.now().year
+    lineNum = 0
+    crFound = False
+    yearMatched = False
+    with io.open(f, "r", encoding="utf-8") as fp:
+        lines = fp.readlines()
+    for line in lines:
+        lineNum += 1
+        start, end = getCopyrightYears(line)
+        if start is None:
+            continue
+        crFound = True
+        if start > end:
+            e = [
+                f,
+                lineNum,
+                "First year after second year in the copyright "
+                "header (manual fix required)",
+                None
+            ]
+            errs.append(e)
+        if thisYear < start or thisYear > end:
+            e = [
+                f,
+                lineNum,
+                "Current year not included in the "
+                "copyright header",
+                None
+            ]
+            if thisYear < start:
+                e[-1] = replaceCurrentYear(line, thisYear, end)
+            if thisYear > end:
+                e[-1] = replaceCurrentYear(line, start, thisYear)
+            errs.append(e)
+        else:
+            yearMatched = True
+    # copyright header itself not found
+    if not crFound:
+        e = [
+            f,
+            0,
+            "Copyright header missing or formatted incorrectly "
+            "(manual fix required)",
+            None
+        ]
+        errs.append(e)
+    # if the current year matched any copyright header, the check passes
+    if yearMatched:
+        errs = []
+
+    if update_current_year:
+        errs_update = [x for x in errs if x[-1] is not None]
+        if len(errs_update) > 0:
+            print("File: {}. Changing line(s) {}".format(
+                f, ', '.join(str(x[1]) for x in errs if x[-1] is not None)))
+            for _, lineNum, __, replacement in errs_update:
+                lines[lineNum - 1] = replacement
+            with io.open(f, "w", encoding="utf-8") as out_file:
+                for new_line in lines:
+                    out_file.write(new_line)
+        errs = [x for x in errs if x[-1] is None]
+
+    return errs
+
+
+def getAllFilesUnderDir(root, pathFilter=None):
+    retList = []
+    for (dirpath, dirnames, filenames) in os.walk(root):
+        for fn in filenames:
+            filePath = os.path.join(dirpath, fn)
+            if pathFilter(filePath):
+                retList.append(filePath)
+    return retList
+
+
+def checkCopyright_main():
+    """
+    Checks for copyright headers in all the modified files. For a local
+    repo, this script will just look for uncommitted files; in CI, it
+    compares between the branches "$PR_TARGET_BRANCH" and
+    "current-pr-branch"
+    """
+    retVal = 0
+    global ExemptFiles
+
+    argparser = argparse.ArgumentParser(
+        "Checks for a consistent copyright header in git's modified files")
+    argparser.add_argument("--update-current-year",
+                           dest='update_current_year',
+                           action="store_true",
+                           required=False,
+                           help="If set, "
+                           "update the current year if a header is already "
+                           "present and well formatted.")
+    argparser.add_argument("--git-modified-only",
+                           dest='git_modified_only',
+                           action="store_true",
+                           required=False,
+                           help="If set, "
+                           "only files seen as modified by git will be "
+                           "processed.")
+
+    (args, dirs) = argparser.parse_known_args()
+    try:
+        ExemptFiles = [re.compile(file) for file in ExemptFiles]
+    except re.error as reException:
+        print("Regular expression error:")
+        print(reException)
+        return 1
+
+    if args.git_modified_only:
+        files = gitutils.modifiedFiles(pathFilter=checkThisFile)
+    else:
+        files = []
+        for d in [os.path.abspath(d) for d in dirs]:
+            if not (os.path.isdir(d)):
+                raise ValueError(f"{d} is not a directory.")
+            files += getAllFilesUnderDir(d, pathFilter=checkThisFile)
+
+    errors = []
+    for f in files:
+        errors += checkCopyright(f, args.update_current_year)
+
+    if len(errors) > 0:
+        print("Copyright headers incomplete in some of the files!")
+        for e in errors:
+            print("  %s:%d Issue: %s" % (e[0], e[1], e[2]))
+        print("")
+        n_fixable = sum(1 for e in errors if e[-1] is not None)
+        path_parts = os.path.abspath(__file__).split(os.sep)
+        file_from_repo = os.sep.join(path_parts[path_parts.index("ci"):])
+        if n_fixable > 0:
+            print(("You can run `python {} --git-modified-only "
+                   "--update-current-year` to fix {} of these "
+                   "errors.\n").format(file_from_repo, n_fixable))
+        retVal = 1
+    else:
+        print("Copyright check passed")
+
+    return retVal
+
+
+if __name__ == "__main__":
+    sys.exit(checkCopyright_main())
\ No newline at end of file
diff --git a/ci/checks/gitutils.py b/ci/checks/gitutils.py
new file mode 100644
index 00000000000..0aea1d660cb
--- /dev/null
+++ b/ci/checks/gitutils.py
@@ -0,0 +1,286 @@
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
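[Editor's aside on the copyright.py script above, before gitutils.py continues: the year-rewriting logic can be exercised in isolation. A minimal standalone sketch using the same two patterns; the sample header line and the `update_year` wrapper are illustrative, not part of the PR.]

```python
import re

# Same patterns as copyright.py: a single-year form and a year-range form.
CheckSimple = re.compile(
    r"Copyright *(?:\(c\))? *(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)")
CheckDouble = re.compile(
    r"Copyright *(?:\(c\))? *(\d{4})-(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)")

def update_year(line, this_year):
    # Mirror replaceCurrentYear(): promote "YYYY" to "YYYY-YYYY" first,
    # then rewrite the range so it ends at the current year.
    line = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line)
    start = int(CheckDouble.search(line).group(1))
    return CheckDouble.sub(
        "Copyright (c) {:04d}-{:04d}, NVIDIA CORPORATION".format(start, this_year),
        line)

print(update_year("# Copyright (c) 2020, NVIDIA CORPORATION.", 2022))
# -> "# Copyright (c) 2020-2022, NVIDIA CORPORATION."
```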
+#
+
+import subprocess
+import os
+import re
+
+
+def isFileEmpty(f):
+    return os.stat(f).st_size == 0
+
+
+def __git(*opts):
+    """Runs a git command and returns its output"""
+    cmd = "git " + " ".join(list(opts))
+    ret = subprocess.check_output(cmd, shell=True)
+    return ret.decode("UTF-8").rstrip("\n")
+
+
+def __gitdiff(*opts):
+    """Runs a git diff command with no pager set"""
+    return __git("--no-pager", "diff", *opts)
+
+
+def branch():
+    """Returns the name of the current branch"""
+    name = __git("rev-parse", "--abbrev-ref", "HEAD")
+    name = name.rstrip()
+    return name
+
+
+def repo_version():
+    """
+    Determines the version of the repo by using `git describe`
+
+    Returns
+    -------
+    str
+        The full version of the repo in the format 'v#.#.#{a|b|rc}'
+    """
+    return __git("describe", "--tags", "--abbrev=0")
+
+
+def repo_version_major_minor():
+    """
+    Determines the version of the repo using `git describe` and returns only
+    the major and minor portion
+
+    Returns
+    -------
+    str
+        The partial version of the repo in the format '{major}.{minor}'
+    """
+
+    full_repo_version = repo_version()
+
+    match = re.match(r"^v?(?P<major>[0-9]+)(?:\.(?P<minor>[0-9]+))?",
+                     full_repo_version)
+
+    if (match is None):
+        print("  [DEBUG] Could not determine repo major minor version. "
+              f"Full repo version: {full_repo_version}.")
+        return None
+
+    out_version = match.group("major")
+
+    if (match.group("minor")):
+        out_version += "." + match.group("minor")
+
+    return out_version
+
+
+def determine_merge_commit(current_branch="HEAD"):
+    """
+    When running outside of CI, this will estimate the target merge commit
+    hash of `current_branch` by finding a common ancestor with the remote
+    branch 'branch-{major}.{minor}' where {major} and {minor} are determined
+    from the repo version.
+
+    Parameters
+    ----------
+    current_branch : str, optional
+        Which branch to consider as the current branch, by default "HEAD"
+
+    Returns
+    -------
+    str
+        The common commit hash ID
+    """
+
+    try:
+        # Try to determine the target branch from the most recent tag
+        head_branch = __git("describe",
+                            "--all",
+                            "--tags",
+                            "--match='branch-*'",
+                            "--abbrev=0")
+    except subprocess.CalledProcessError:
+        print("  [DEBUG] Could not determine target branch from most recent "
+              "tag. Falling back to 'branch-{major}.{minor}'.")
+        head_branch = None
+
+    if (head_branch is not None):
+        # Convert from head to branch name
+        head_branch = __git("name-rev", "--name-only", head_branch)
+    else:
+        # Try and guess the target branch as "branch-{major}.{minor}"
+        version = repo_version_major_minor()
+
+        if (version is None):
+            return None
+
+        head_branch = "branch-{}".format(version)
+
+    try:
+        # Now get the remote tracking branch
+        remote_branch = __git("rev-parse",
+                              "--abbrev-ref",
+                              "--symbolic-full-name",
+                              head_branch + "@{upstream}")
+    except subprocess.CalledProcessError:
+        print("  [DEBUG] Could not find the remote tracking reference for "
+              f"branch {head_branch}.")
+        remote_branch = None
+
+    if (remote_branch is None):
+        return None
+
+    print(f"  [DEBUG] Determined TARGET_BRANCH as: '{remote_branch}'. "
+          "Finding common ancestor.")
+
+    common_commit = __git("merge-base", remote_branch, current_branch)
+
+    return common_commit
+
+
+def uncommittedFiles():
+    """
+    Returns a list of all changed files that are not yet committed. This
+    means both untracked/unstaged as well as uncommitted files too.
+ """ + files = __git("status", "-u", "-s") + ret = [] + for f in files.splitlines(): + f = f.strip(" ") + f = re.sub("\s+", " ", f) # noqa: W605 + tmp = f.split(" ", 1) + # only consider staged files or uncommitted files + # in other words, ignore untracked files + if tmp[0] == "M" or tmp[0] == "A": + ret.append(tmp[1]) + return ret + + +def changedFilesBetween(baseName, branchName, commitHash): + """ + Returns a list of files changed between branches baseName and latest commit + of branchName. + """ + current = branch() + # checkout "base" branch + __git("checkout", "--force", baseName) + # checkout branch for comparing + __git("checkout", "--force", branchName) + # checkout latest commit from branch + __git("checkout", "-fq", commitHash) + + files = __gitdiff("--name-only", + "--ignore-submodules", + f"{baseName}..{branchName}") + + # restore the original branch + __git("checkout", "--force", current) + return files.splitlines() + + +def changesInFileBetween(file, b1, b2, filter=None): + """Filters the changed lines to a file between the branches b1 and b2""" + current = branch() + __git("checkout", "--quiet", b1) + __git("checkout", "--quiet", b2) + diffs = __gitdiff("--ignore-submodules", + "-w", + "--minimal", + "-U0", + "%s...%s" % (b1, b2), + "--", + file) + __git("checkout", "--quiet", current) + lines = [] + for line in diffs.splitlines(): + if filter is None or filter(line): + lines.append(line) + return lines + + +def modifiedFiles(pathFilter=None): + """ + If inside a CI-env (ie. TARGET_BRANCH and COMMIT_HASH are defined, and + current branch is "current-pr-branch"), then lists out all files modified + between these 2 branches. Locally, TARGET_BRANCH will try to be determined + from the current repo version and finding a coresponding branch named + 'branch-{major}.{minor}'. If this fails, this functino will list out all + the uncommitted files in the current branch. + + Such utility function is helpful while putting checker scripts as part of + cmake, as well as CI process. This way, during development, only the files + touched (but not yet committed) by devs can be checked. But, during the CI + process ALL files modified by the dev, as submiited in the PR, will be + checked. This happens, all the while using the same script. + """ + targetBranch = os.environ.get("TARGET_BRANCH") + commitHash = os.environ.get("COMMIT_HASH") + currentBranch = branch() + print( + f" [DEBUG] TARGET_BRANCH={targetBranch}, COMMIT_HASH={commitHash}, " + f"currentBranch={currentBranch}") + + if targetBranch and commitHash and (currentBranch == "current-pr-branch"): + print(" [DEBUG] Assuming a CI environment.") + allFiles = changedFilesBetween(targetBranch, currentBranch, commitHash) + else: + print(" [DEBUG] Did not detect CI environment. " + "Determining TARGET_BRANCH locally.") + + common_commit = determine_merge_commit(currentBranch) + + if (common_commit is not None): + + # Now get the diff. 
Use --staged to get both diff between + # common_commit..HEAD and any locally staged files + allFiles = __gitdiff("--name-only", + "--ignore-submodules", + "--staged", + f"{common_commit}").splitlines() + else: + # Fallback to just uncommitted files + allFiles = uncommittedFiles() + + files = [] + for f in allFiles: + if pathFilter is None or pathFilter(f): + files.append(f) + + filesToCheckString = "\n\t".join(files) if files else "" + print(f" [DEBUG] Found files to check:\n\t{filesToCheckString}\n") + return files + + +def listAllFilesInDir(folder): + """Utility function to list all files/subdirs in the input folder""" + allFiles = [] + for root, dirs, files in os.walk(folder): + for name in files: + allFiles.append(os.path.join(root, name)) + return allFiles + + +def listFilesToCheck(filesDirs, filter=None): + """ + Utility function to filter the input list of files/dirs based on the input + filter method and returns all the files that need to be checked + """ + allFiles = [] + for f in filesDirs: + if os.path.isfile(f): + if filter is None or filter(f): + allFiles.append(f) + elif os.path.isdir(f): + files = listAllFilesInDir(f) + for f_ in files: + if filter is None or filter(f_): + allFiles.append(f_) + return allFiles diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 9fb86b0b3c5..a7ad260758d 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. ##################### # cuDF Style Tester # ##################### @@ -19,9 +19,24 @@ export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake. mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} + pre-commit run --hook-stage manual --all-files PRE_COMMIT_RETVAL=$? +# Check for copyright headers in the files modified currently +COPYRIGHT=`python ci/checks/copyright.py --git-modified-only 2>&1` +CR_RETVAL=$? + +# Output results if failure otherwise show pass +if [ "$CR_RETVAL" != "0" ]; then + echo -e "\n\n>>>> FAILED: copyright check; begin output\n\n" + echo -e "$COPYRIGHT" + echo -e "\n\n>>>> FAILED: copyright check; end output\n\n" +else + echo -e "\n\n>>>> PASSED: copyright check\n\n" + echo -e "$COPYRIGHT" +fi + # Run clang-format and check for a consistent code format CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1` CLANG_FORMAT_RETVAL=$? @@ -40,7 +55,7 @@ HEADER_META_RETVAL=$? 
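[Editor's aside before the rest of the style.sh hunk: when run locally, the gitutils helpers above boil down to a short sequence of git calls. A condensed standalone sketch of that flow, with error handling and the CI path elided; the `v22.02.00` tag shown in the comment is hypothetical.]

```python
import subprocess

def git(*args):
    # Minimal stand-in for gitutils.__git (which also disables the pager)
    return subprocess.check_output(("git",) + args).decode().rstrip("\n")

# 1. Guess the target branch from the repo version, e.g. v22.02.00 -> branch-22.02
version = git("describe", "--tags", "--abbrev=0")
major, minor = version.lstrip("v").split(".")[:2]
head_branch = f"branch-{major}.{minor}"

# 2. Resolve its remote tracking branch and the merge base with HEAD
remote = git("rev-parse", "--abbrev-ref", "--symbolic-full-name",
             head_branch + "@{upstream}")
base = git("merge-base", remote, "HEAD")

# 3. Diff against the merge base; --staged also picks up staged-only changes
changed = git("diff", "--name-only", "--ignore-submodules", "--staged",
              base).splitlines()
print(changed)
```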
echo -e "$HEADER_META" RETVALS=( - $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL + $CR_RETVAL $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL ) IFS=$'\n' RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1` diff --git a/ci/cpu/upload.sh b/ci/cpu/upload.sh index e6ef72d930c..f2f67e9e000 100755 --- a/ci/cpu/upload.sh +++ b/ci/cpu/upload.sh @@ -23,25 +23,13 @@ if [ -z "$MY_UPLOAD_KEY" ]; then return 0 fi -################################################################################ -# SETUP - Get conda file output locations -################################################################################ - -gpuci_logger "Get conda file output locations" - -export LIBCUDF_FILE=`conda build --no-build-id --croot "$WORKSPACE/.conda-bld" conda/recipes/libcudf --output` -export LIBCUDF_KAFKA_FILE=`conda build --no-build-id --croot "$WORKSPACE/.conda-bld" conda/recipes/libcudf_kafka --output` -export CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf --python=$PYTHON --output` -export DASK_CUDF_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/dask-cudf --python=$PYTHON --output` -export CUDF_KAFKA_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/cudf_kafka --python=$PYTHON --output` -export CUSTREAMZ_FILE=`conda build --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON --output` - ################################################################################ # UPLOAD - Conda packages ################################################################################ gpuci_logger "Starting conda uploads" if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF" == "1" ]]; then + export LIBCUDF_FILE=$(conda build --no-build-id --croot "${CONDA_BLD_DIR}" conda/recipes/libcudf --output) test -e ${LIBCUDF_FILE} echo "Upload libcudf" echo ${LIBCUDF_FILE} @@ -49,16 +37,19 @@ if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF" == "1" ]]; then fi if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF" == "1" ]]; then + export CUDF_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/cudf --python=$PYTHON --output) test -e ${CUDF_FILE} echo "Upload cudf" echo ${CUDF_FILE} gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CUDF_FILE} --no-progress + export DASK_CUDF_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/dask-cudf --python=$PYTHON --output) test -e ${DASK_CUDF_FILE} echo "Upload dask-cudf" echo ${DASK_CUDF_FILE} gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${DASK_CUDF_FILE} --no-progress + export CUSTREAMZ_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/custreamz --python=$PYTHON --output) test -e ${CUSTREAMZ_FILE} echo "Upload custreamz" echo ${CUSTREAMZ_FILE} @@ -66,6 +57,7 @@ if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF" == "1" ]]; then fi if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF_KAFKA" == "1" ]]; then + export LIBCUDF_KAFKA_FILE=$(conda build --no-build-id --croot "${CONDA_BLD_DIR}" conda/recipes/libcudf_kafka --output) test -e ${LIBCUDF_KAFKA_FILE} echo "Upload libcudf_kafka" echo ${LIBCUDF_KAFKA_FILE} @@ -73,6 +65,7 @@ if [[ "$BUILD_LIBCUDF" == "1" && "$UPLOAD_LIBCUDF_KAFKA" == "1" ]]; then fi if [[ "$BUILD_CUDF" == "1" && "$UPLOAD_CUDF_KAFKA" == "1" ]]; then + export CUDF_KAFKA_FILE=$(conda build --croot "${CONDA_BLD_DIR}" conda/recipes/cudf_kafka --python=$PYTHON --output) test -e ${CUDF_KAFKA_FILE} echo "Upload cudf_kafka" echo ${CUDF_KAFKA_FILE} diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 
53ad948b61c..a79ffa0fc47 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. ############################################## # cuDF GPU build and test script for CI # ############################################## @@ -128,11 +128,7 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then ################################################################################ gpuci_logger "Build from source" - if [[ "${BUILD_MODE}" == "pull-request" ]]; then - "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds - else - "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests -l --ptds - fi + "$WORKSPACE/build.sh" clean libcudf cudf dask_cudf libcudf_kafka cudf_kafka benchmarks tests --ptds ################################################################################ # TEST - Run GoogleTest @@ -226,11 +222,7 @@ else install_dask gpuci_logger "Build python libs from source" - if [[ "${BUILD_MODE}" == "pull-request" ]]; then - "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka --ptds - else - "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka -l --ptds - fi + "$WORKSPACE/build.sh" cudf dask_cudf cudf_kafka --ptds fi @@ -249,15 +241,15 @@ fi cd "$WORKSPACE/python/cudf" gpuci_logger "Python py.test for cuDF" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config=.coveragerc --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope cudf cd "$WORKSPACE/python/dask_cudf" gpuci_logger "Python py.test for dask-cudf" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term dask_cudf cd "$WORKSPACE/python/custreamz" gpuci_logger "Python py.test for cuStreamz" -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml="$WORKSPACE/junit-custreamz.xml" -v --cov-config=.coveragerc --cov=custreamz --cov-report=xml:"$WORKSPACE/python/custreamz/custreamz-coverage.xml" --cov-report term custreamz gpuci_logger "Test notebooks" "$WORKSPACE/ci/gpu/test-notebooks.sh" 2>&1 | tee nbtest.log diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index b9577d937d9..b926a6cdc99 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. 
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. name: cudf_dev channels: @@ -17,7 +17,7 @@ dependencies: - numba>=0.54 - numpy - pandas>=1.0,<1.4.0dev0 - - pyarrow=5.0.0=*cuda + - pyarrow=6.0.1=*cuda - fastavro>=0.22.9 - python-snappy>=0.6.0 - notebook>=0.5.0 @@ -45,7 +45,7 @@ dependencies: - dask>=2021.11.1,<=2022.01.0 - distributed>=2021.11.1,<=2022.01.0 - streamz - - arrow-cpp=5.0.0 + - arrow-cpp=6.0.1 - dlpack>=0.5,<0.6.0a0 - arrow-cpp-proc * cuda - double-conversion diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index bd1412bc611..0145e2e4d01 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -31,7 +31,7 @@ requirements: - setuptools - numba >=0.54 - dlpack>=0.5,<0.6.0a0 - - pyarrow 5.0.0 *cuda + - pyarrow 6.0.1 *cuda - libcudf {{ version }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 70c020d4abd..4e20c979f6c 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -40,7 +40,7 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp 5.0.0 *cuda + - arrow-cpp 6.0.1 *cuda - arrow-cpp-proc * cuda - dlpack>=0.5,<0.6.0a0 run: @@ -216,6 +216,7 @@ test: - test -f $PREFIX/include/cudf/strings/replace_re.hpp - test -f $PREFIX/include/cudf/strings/split/partition.hpp - test -f $PREFIX/include/cudf/strings/split/split.hpp + - test -f $PREFIX/include/cudf/strings/split/split_re.hpp - test -f $PREFIX/include/cudf/strings/string_view.hpp - test -f $PREFIX/include/cudf/strings/strings_column_view.hpp - test -f $PREFIX/include/cudf/strings/strip.hpp diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index 0b274f3a41d..1eb5b13ddc3 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -27,8 +27,6 @@ requirements: host: - libcudf {{version}} - librdkafka >=1.7.0,<1.8.0a0 - run: - - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not test: commands: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 90e94ffcc7b..2ffd62f1b53 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -445,6 +445,7 @@ add_library( src/strings/search/find_multiple.cu src/strings/split/partition.cu src/strings/split/split.cu + src/strings/split/split_re.cu src/strings/split/split_record.cu src/strings/strings_column_factories.cu src/strings/strings_column_view.cpp @@ -497,6 +498,8 @@ set_target_properties( # set target compile options CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON + # For std:: support of __int128_t. 
Can be removed once using cuda::std + CXX_EXTENSIONS ON CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 0704180bad0..11eef015364 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -59,6 +59,12 @@ function(ConfigureBench CMAKE_BENCH_NAME) ${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + # For std:: support of __int128_t. Can be removed once using cuda::std + CXX_EXTENSIONS ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) target_link_libraries( ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main @@ -254,7 +260,7 @@ ConfigureBench( string/convert_durations.cpp string/convert_fixed_point.cpp string/convert_numerics.cpp - string/copy.cpp + string/copy.cu string/extract.cpp string/factory.cu string/filter.cpp @@ -265,7 +271,7 @@ ConfigureBench( string/split.cpp string/substring.cpp string/translate.cpp - string/url_decode.cpp + string/url_decode.cu ) # ################################################################################################## diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index 745d4e354e7..f8226c7387a 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,8 +27,8 @@ class COMPILED_BINARYOP : public cudf::benchmark { }; -template -void BM_compiled_binaryop(benchmark::State& state) +template +void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) { const cudf::size_type column_size{(cudf::size_type)state.range(0)}; @@ -50,21 +50,26 @@ void BM_compiled_binaryop(benchmark::State& state) } // TODO tparam boolean for null. -#define BINARYOP_BENCHMARK_DEFINE(TypeLhs, TypeRhs, binop, TypeOut) \ - TEMPLATED_BENCHMARK_F(COMPILED_BINARYOP, \ - BM_compiled_binaryop, \ - TypeLhs, \ - TypeRhs, \ - TypeOut, \ - cudf::binary_operator::binop) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ +#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ + BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name) \ + (::benchmark::State & st) \ + { \ + BM_compiled_binaryop(st, cudf::binary_operator::bop); \ + } \ + BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ + ->Unit(benchmark::kMicrosecond) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ ->Arg(100000000); /* 100M */ +#define build_name(a, b, c, d) a##_##b##_##c##_##d + +#define BINARYOP_BENCHMARK_DEFINE(lhs, rhs, bop, tout) \ + BM_BINARYOP_BENCHMARK_DEFINE(build_name(bop, lhs, rhs, tout), lhs, rhs, bop, tout) + using namespace cudf; using namespace numeric; diff --git a/cpp/benchmarks/join/join.cu b/cpp/benchmarks/join/join.cu index 55a1e524479..f21356aff02 100644 --- a/cpp/benchmarks/join/join.cu +++ b/cpp/benchmarks/join/join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,7 +49,7 @@ void nvbench_inner_join(nvbench::state& state, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) { cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); - return hj_obj.inner_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + return hj_obj.inner_join(right_input.select(right_on), std::nullopt, stream); }; BM_join(state, join); @@ -71,7 +71,7 @@ void nvbench_left_join(nvbench::state& state, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) { cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); - return hj_obj.left_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + return hj_obj.left_join(right_input.select(right_on), std::nullopt, stream); }; BM_join(state, join); @@ -93,7 +93,7 @@ void nvbench_full_join(nvbench::state& state, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) { cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); - return hj_obj.full_join(right_input.select(right_on), compare_nulls, std::nullopt, stream); + return hj_obj.full_join(right_input.select(right_on), std::nullopt, stream); }; BM_join(state, join); diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index e88253395d8..f2b9cb1bdb9 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
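[Editor's aside before the next join_common.hpp hunk: it replaces the host-side random null mask with a device-side `null75_generator` functor. The trick is that the two low bits of a uniform random draw are both zero with probability 1/4, so about 25% of rows are valid and roughly 75% are null. A quick standalone check of that arithmetic; plain Python stand-in for the thrust functor, not the actual benchmark code.]

```python
import random

def is_valid(rng):
    # Mirrors null75_generator: the two low bits of a uniform draw are 00
    # with probability 1/4, so ~25% of rows are valid (~75% null).
    return (rng.getrandbits(32) & 3) == 0

rng = random.Random(0)
n = 1_000_000
null_fraction = 1 - sum(is_valid(rng) for _ in range(n)) / n
print(f"{null_fraction:.3f}")  # ~0.750
```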
@@ -16,24 +16,41 @@ #pragma once -#include -#include +#include "generate_input_tables.cuh" -#include +#include +#include + +#include +#include #include +#include +#include #include +#include #include #include -#include -#include -#include -#include +#include + +#include +#include +#include #include -#include "generate_input_tables.cuh" +struct null75_generator { + thrust::minstd_rand engine; + thrust::uniform_int_distribution rand_gen; + null75_generator() : engine(), rand_gen() {} + __device__ bool operator()(size_t i) + { + engine.discard(i); + // roughly 75% nulls + return (rand_gen(engine) & 3) == 0; + } +}; template rand_gen(0, build_table_size); - auto build_random_null_mask = [&rand_gen](int size) { + auto build_random_null_mask = [](int size) { // roughly 75% nulls - auto validity = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); - return cudf::test::detail::make_null_mask(validity, validity + size); + auto validity = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), null75_generator{}); + return cudf::detail::valid_if(validity, validity + size, thrust::identity{}).first; }; std::unique_ptr build_key_column = [&]() { @@ -96,17 +111,14 @@ static void BM_join(state_type& state, Join JoinFunc) selectivity, multiplicity); - auto payload_data_it = thrust::make_counting_iterator(0); - cudf::test::fixed_width_column_wrapper build_payload_column( - payload_data_it, payload_data_it + build_table_size); - - cudf::test::fixed_width_column_wrapper probe_payload_column( - payload_data_it, payload_data_it + probe_table_size); + auto init = cudf::make_fixed_width_scalar(static_cast(0)); + auto build_payload_column = cudf::sequence(build_table_size, *init); + auto probe_payload_column = cudf::sequence(probe_table_size, *init); CHECK_CUDA(0); - cudf::table_view build_table({build_key_column->view(), build_payload_column}); - cudf::table_view probe_table({probe_key_column->view(), probe_payload_column}); + cudf::table_view build_table({build_key_column->view(), *build_payload_column}); + cudf::table_view probe_table({probe_key_column->view(), *probe_payload_column}); // Setup join parameters and result table [[maybe_unused]] std::vector columns_to_join = {0}; diff --git a/cpp/benchmarks/string/copy.cpp b/cpp/benchmarks/string/copy.cu similarity index 83% rename from cpp/benchmarks/string/copy.cpp rename to cpp/benchmarks/string/copy.cu index d40b0e069bc..2f064e71c44 100644 --- a/cpp/benchmarks/string/copy.cpp +++ b/cpp/benchmarks/string/copy.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,8 @@ * limitations under the License. 
*/ -#include +#include "string_bench_args.hpp" + #include #include #include @@ -23,10 +24,7 @@ #include #include -#include -#include - -#include "string_bench_args.hpp" +#include class StringCopy : public cudf::benchmark { }; @@ -47,11 +45,14 @@ static void BM_copy(benchmark::State& state, copy_type ct) create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); // scatter indices - std::vector host_map_data(n_rows); - std::iota(host_map_data.begin(), host_map_data.end(), 0); - std::random_shuffle(host_map_data.begin(), host_map_data.end()); - cudf::test::fixed_width_column_wrapper index_map(host_map_data.begin(), - host_map_data.end()); + auto index_map_col = make_numeric_column( + cudf::data_type{cudf::type_id::INT32}, n_rows, cudf::mask_state::UNALLOCATED); + auto index_map = index_map_col->mutable_view(); + thrust::shuffle_copy(thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(n_rows), + index_map.begin(), + thrust::default_random_engine()); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); diff --git a/cpp/benchmarks/string/url_decode.cpp b/cpp/benchmarks/string/url_decode.cpp deleted file mode 100644 index 4dc77cffa1a..00000000000 --- a/cpp/benchmarks/string/url_decode.cpp +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include - -struct url_string_generator { - size_t num_chars; - std::bernoulli_distribution dist; - - url_string_generator(size_t num_chars, double esc_seq_chance) - : num_chars{num_chars}, dist{esc_seq_chance} - { - } - - std::string operator()(std::mt19937& engine) - { - std::string str; - str.reserve(num_chars); - while (str.size() < num_chars) { - if (str.size() < num_chars - 3 && dist(engine)) { - str += "%20"; - } else { - str.push_back('a'); - } - } - return str; - } -}; - -cudf::test::strings_column_wrapper generate_column(cudf::size_type num_rows, - cudf::size_type chars_per_row, - double esc_seq_chance) -{ - std::mt19937 engine(1); - url_string_generator url_gen(chars_per_row, esc_seq_chance); - std::vector strings; - strings.reserve(num_rows); - std::generate_n(std::back_inserter(strings), num_rows, [&]() { return url_gen(engine); }); - return cudf::test::strings_column_wrapper(strings.begin(), strings.end()); -} - -class UrlDecode : public cudf::benchmark { -}; - -template -void BM_url_decode(benchmark::State& state) -{ - cudf::size_type const num_rows = state.range(0); - cudf::size_type const chars_per_row = state.range(1); - - auto column = generate_column(num_rows, chars_per_row, esc_seq_pct / 100.0); - auto strings_view = cudf::strings_column_view(column); - - for (auto _ : state) { - cuda_event_timer raii(state, true, rmm::cuda_stream_default); - auto result = cudf::strings::url_decode(strings_view); - } - - state.SetBytesProcessed(state.iterations() * num_rows * - (chars_per_row + sizeof(cudf::size_type))); -} - -#define URLD_BENCHMARK_DEFINE(esc_seq_pct) \ - TEMPLATED_BENCHMARK_F(UrlDecode, BM_url_decode, esc_seq_pct) \ - ->Args({100000000, 10}) \ - ->Args({10000000, 100}) \ - ->Args({1000000, 1000}) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); - -URLD_BENCHMARK_DEFINE(10) -URLD_BENCHMARK_DEFINE(50) diff --git a/cpp/benchmarks/string/url_decode.cu b/cpp/benchmarks/string/url_decode.cu new file mode 100644 index 00000000000..c460820d788 --- /dev/null +++ b/cpp/benchmarks/string/url_decode.cu @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +struct url_string_generator { + char* chars; + double esc_seq_chance; + thrust::minstd_rand engine; + thrust::uniform_real_distribution esc_seq_dist; + url_string_generator(char* c, double esc_seq_chance, thrust::minstd_rand& engine) + : chars(c), esc_seq_chance(esc_seq_chance), engine(engine), esc_seq_dist(0, 1) + { + } + + __device__ void operator()(thrust::tuple str_begin_end) + { + auto begin = thrust::get<0>(str_begin_end); + auto end = thrust::get<1>(str_begin_end); + engine.discard(begin); + for (auto i = begin; i < end; ++i) { + if (esc_seq_dist(engine) < esc_seq_chance and i < end - 3) { + chars[i] = '%'; + chars[i + 1] = '2'; + chars[i + 2] = '0'; + i += 2; + } else { + chars[i] = 'a'; + } + } + } +}; + +auto generate_column(cudf::size_type num_rows, cudf::size_type chars_per_row, double esc_seq_chance) +{ + std::vector strings{std::string(chars_per_row, 'a')}; + auto col_1a = cudf::test::strings_column_wrapper(strings.begin(), strings.end()); + auto table_a = cudf::repeat(cudf::table_view{{col_1a}}, num_rows); + auto result_col = std::move(table_a->release()[0]); // string column with num_rows aaa... + auto chars_col = result_col->child(cudf::strings_column_view::chars_column_index).mutable_view(); + auto offset_col = result_col->child(cudf::strings_column_view::offsets_column_index).view(); + + auto engine = thrust::default_random_engine{}; + thrust::for_each_n(thrust::device, + thrust::make_zip_iterator(offset_col.begin(), + offset_col.begin() + 1), + num_rows, + url_string_generator{chars_col.begin(), esc_seq_chance, engine}); + return result_col; +} + +class UrlDecode : public cudf::benchmark { +}; + +void BM_url_decode(benchmark::State& state, int esc_seq_pct) +{ + cudf::size_type const num_rows = state.range(0); + cudf::size_type const chars_per_row = state.range(1); + + auto column = generate_column(num_rows, chars_per_row, esc_seq_pct / 100.0); + auto strings_view = cudf::strings_column_view(column->view()); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + auto result = cudf::strings::url_decode(strings_view); + } + + state.SetBytesProcessed(state.iterations() * num_rows * + (chars_per_row + sizeof(cudf::size_type))); +} + +#define URLD_BENCHMARK_DEFINE(esc_seq_pct) \ + BENCHMARK_DEFINE_F(UrlDecode, esc_seq_pct) \ + (::benchmark::State & st) { BM_url_decode(st, esc_seq_pct); } \ + BENCHMARK_REGISTER_F(UrlDecode, esc_seq_pct) \ + ->Args({100000000, 10}) \ + ->Args({10000000, 100}) \ + ->Args({1000000, 1000}) \ + ->Unit(benchmark::kMillisecond) \ + ->UseManualTime(); + +URLD_BENCHMARK_DEFINE(10) +URLD_BENCHMARK_DEFINE(50) diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index 90097889a86..ca19e3046ad 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,15 +14,16 @@ * limitations under the License. 
*/ -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" +#include +#include #include #include #include -#include #include +#include +#include #include #include @@ -170,21 +171,18 @@ void launch_kernel(cudf::mutable_table_view input, T** d_ptr, int work_per_threa template void type_dispatcher_benchmark(::benchmark::State& state) { - const auto source_size = static_cast(state.range(1)); - - const auto n_cols = static_cast(state.range(0)); - + const auto n_cols = static_cast(state.range(0)); + const auto source_size = static_cast(state.range(1)); const auto work_per_thread = static_cast(state.range(2)); - auto data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); + auto init = cudf::make_fixed_width_scalar(static_cast(0)); - std::vector> source_column_wrappers; + std::vector> source_column_wrappers; std::vector source_columns; for (int i = 0; i < n_cols; ++i) { - source_column_wrappers.push_back( - cudf::test::fixed_width_column_wrapper(data, data + source_size)); - source_columns.push_back(source_column_wrappers[i]); + source_column_wrappers.push_back(cudf::sequence(source_size, *init)); + source_columns.push_back(*source_column_wrappers[i]); } cudf::mutable_table_view source_table{source_columns}; diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index ae1448da502..83c5e4c3e8f 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -308,7 +308,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endfunction() -set(CUDF_VERSION_Arrow 5.0.0) +set(CUDF_VERSION_Arrow 6.0.1) find_and_configure_arrow( ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC} diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 23587f49334..fb5b968671f 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -79,6 +79,7 @@ class aggregation { ROW_NUMBER, ///< get row-number of current index (relative to rolling window) RANK, ///< get rank of current index DENSE_RANK, ///< get dense rank of current index + PERCENT_RANK, ///< get percent (i.e. fractional) rank of current index COLLECT_LIST, ///< collect values into a list COLLECT_SET, ///< collect values into a list without duplicate entries LEAD, ///< window function, accesses row at specified offset following current row @@ -305,34 +306,34 @@ std::unique_ptr make_row_number_aggregation(); * 3. `RANK` aggregations are not compatible with exclusive scans. * * @code{.pseudo} - * Example: Consider an motor-racing statistics dataset, containing the following columns: - * 1. driver_name: (STRING) Name of the car driver - * 2. num_overtakes: (INT32) Number of times the driver overtook another car in a lap - * 3. 
lap_number: (INT32) The number of the lap + * Example: Consider a motor-racing statistics dataset, containing the following columns: + * 1. venue: (STRING) Location of the race event + * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) + * 3. time: (INT32) Time taken to complete the circuit * * For the following presorted data: * - * [ // driver_name, num_overtakes, lap_number - * { "bottas", 2, 3 }, - * { "bottas", 2, 7 }, - * { "bottas", 2, 7 }, - * { "bottas", 1, 1 }, - * { "bottas", 1, 2 }, - * { "hamilton", 4, 1 }, - * { "hamilton", 4, 1 }, - * { "hamilton", 3, 4 }, - * { "hamilton", 2, 4 } + * [ // venue, driver, time + * { "silverstone", "HAM" ("hamilton"), 15823}, + * { "silverstone", "LEC" ("leclerc"), 15827}, + * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "RIC" ("ricciardo"), 15905}, + * { "monza", "RIC" ("ricciardo"), 12154}, + * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. + * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. + * { "monza", "LEC" ("leclerc"), 12201}, + * { "monza", "PER" ("perez"), 12203} * ] * * A grouped rank aggregation scan with: - * groupby column : driver_name - * input orderby column: struct_column{num_overtakes, lap_number} - * result: column{1, 2, 2, 4, 5, 1, 1, 3, 4} - * - * A grouped rank aggregation scan with: - * groupby column : driver_name - * input orderby column: num_overtakes - * result: column{1, 1, 1, 4, 4, 1, 1, 3, 4} + * groupby column : venue + * input orderby column: time + * Produces the following rank column: + * { 1, 2, 3, 3, 5, 1, 2, 2, 4, 5} + * (This corresponds to the following grouping and `driver` rows:) + * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } + * <----------silverstone----------->|<-------------monza--------------> * @endcode */ template @@ -357,39 +358,95 @@ std::unique_ptr make_rank_aggregation(); * 3. `DENSE_RANK` aggregations are not compatible with exclusive scans. * * @code{.pseudo} - * Example: Consider an motor-racing statistics dataset, containing the following columns: - * 1. driver_name: (STRING) Name of the car driver - * 2. num_overtakes: (INT32) Number of times the driver overtook another car in a lap - * 3. lap_number: (INT32) The number of the lap + * Example: Consider a motor-racing statistics dataset, containing the following columns: + * 1. venue: (STRING) Location of the race event + * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) + * 3. time: (INT32) Time taken to complete the circuit * * For the following presorted data: * - * [ // driver_name, num_overtakes, lap_number - * { "bottas", 2, 3 }, - * { "bottas", 2, 7 }, - * { "bottas", 2, 7 }, - * { "bottas", 1, 1 }, - * { "bottas", 1, 2 }, - * { "hamilton", 4, 1 }, - * { "hamilton", 4, 1 }, - * { "hamilton", 3, 4 }, - * { "hamilton", 2, 4 } + * [ // venue, driver, time + * { "silverstone", "HAM" ("hamilton"), 15823}, + * { "silverstone", "LEC" ("leclerc"), 15827}, + * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "RIC" ("ricciardo"), 15905}, + * { "monza", "RIC" ("ricciardo"), 12154}, + * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. + * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. 
+ * { "monza", "LEC" ("leclerc"), 12201}, + * { "monza", "PER" ("perez"), 12203} * ] * * A grouped dense rank aggregation scan with: - * groupby column : driver_name - * input orderby column: struct_column{num_overtakes, lap_number} - * result: column{1, 2, 2, 3, 4, 1, 1, 2, 3} - * - * A grouped dense rank aggregation scan with: - * groupby column : driver_name - * input orderby column: num_overtakes - * result: column{1, 1, 1, 2, 2, 1, 1, 2, 3} + * groupby column : venue + * input orderby column: time + * Produces the following dense rank column: + * { 1, 2, 3, 3, 4, 1, 2, 2, 3, 4} + * (This corresponds to the following grouping and `driver` rows:) + * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } + * <----------silverstone----------->|<-------------monza--------------> * @endcode */ template std::unique_ptr make_dense_rank_aggregation(); +/** + * @brief Factory to create a PERCENT_RANK aggregation + * + * `PERCENT_RANK` returns a non-nullable column of double precision "fractional" ranks. + * For row index `i`, the percent rank of row `i` is defined as: + * percent_rank = (rank - 1) / (group_row_count - 1) + * where, + * 1. rank is the `RANK` of the row within the group + * 2. group_row_count is the number of rows in the group + * + * This aggregation only works with "scan" algorithms. The input to the grouped or + * ungrouped scan is an orderby column that orders the rows that the aggregate function ranks. + * If rows are ordered by more than one column, the orderby input column should be a struct + * column containing the ordering columns. + * + * Note: + * 1. This method requires that the rows are presorted by the group keys and order_by columns. + * 2. `PERCENT_RANK` aggregations will return a fully valid column regardless of null_handling + * policy specified in the scan. + * 3. `PERCENT_RANK` aggregations are not compatible with exclusive scans. + * + * @code{.pseudo} + * Example: Consider a motor-racing statistics dataset, containing the following columns: + * 1. venue: (STRING) Location of the race event + * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) + * 3. time: (INT32) Time taken to complete the circuit + * + * For the following presorted data: + * + * [ // venue, driver, time + * { "silverstone", "HAM" ("hamilton"), 15823}, + * { "silverstone", "LEC" ("leclerc"), 15827}, + * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. + * { "silverstone", "RIC" ("ricciardo"), 15905}, + * { "monza", "RIC" ("ricciardo"), 12154}, + * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. + * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. 
+ * { "monza", "LEC" ("leclerc"), 12201}, + * { "monza", "PER" ("perez"), 12203} + * ] + * + * A grouped percent rank aggregation scan with: + * groupby column : venue + * input orderby column: time + * Produces the following percent rank column: + * { 0.00, 0.25, 0.50, 0.50, 1.00, 0.00, 0.25, 0.25, 0.75, 1.00 } + * + * (This corresponds to the following grouping and `driver` rows:) + * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } + * <----------silverstone----------->|<-------------monza--------------> + * @endcode + */ +template +std::unique_ptr make_percent_rank_aggregation(); + /** * @brief Factory to create a COLLECT_LIST aggregation * diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 325f023f283..ba15e37f9ea 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,13 @@ #pragma once #include +#include +#include #include +#include +#include +#include #include /** @@ -375,6 +380,43 @@ class column_view : public detail::column_view_base { */ auto child_end() const noexcept { return _children.cend(); } + /** + * @brief Construct a column view from a device_span. + * + * Only numeric and chrono types are supported. + * + * @tparam T The device span type. Must be const and match the column view's type. + * @param data A typed device span containing the column view's data. + */ + template () or cudf::is_chrono())> + column_view(device_span data) + : column_view( + cudf::data_type{cudf::type_to_id()}, data.size(), data.data(), nullptr, 0, 0, {}) + { + CUDF_EXPECTS(data.size() < std::numeric_limits::max(), + "Data exceeds the maximum size of a column view."); + } + + /** + * @brief Converts a column view into a device span. + * + * Only numeric and chrono data types are supported. The column view must not + * be nullable. + * + * @tparam T The device span type. Must be const and match the column view's type. + * @throws cudf::logic_error if the column view type does not match the span type. + * @throws cudf::logic_error if the column view is nullable. + * @return A typed device span of the column view's data. + */ + template () or cudf::is_chrono())> + [[nodiscard]] operator device_span() const + { + CUDF_EXPECTS(type() == cudf::data_type{cudf::type_to_id()}, + "Device span type must match column view type."); + CUDF_EXPECTS(!nullable(), "A nullable column view cannot be converted to a device span."); + return device_span(data(), size()); + } + private: friend column_view bit_cast(column_view const& input, data_type type); diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index fbf315776f4..a234f4d3715 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -77,6 +77,8 @@ class simple_aggregations_collector { // Declares the interface for the simple class rank_aggregation const& agg); virtual std::vector> visit(data_type col_type, class dense_rank_aggregation const& agg); + virtual std::vector> visit( + data_type col_type, class percent_rank_aggregation const& agg); virtual std::vector> visit( data_type col_type, class collect_list_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -126,6 +128,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class row_number_aggregation const& agg); virtual void visit(class rank_aggregation const& agg); virtual void visit(class dense_rank_aggregation const& agg); + virtual void visit(class percent_rank_aggregation const& agg); virtual void visit(class collect_list_aggregation const& agg); virtual void visit(class collect_set_aggregation const& agg); virtual void visit(class lead_lag_aggregation const& agg); @@ -654,6 +657,22 @@ class dense_rank_aggregation final : public rolling_aggregation, public groupby_ void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +class percent_rank_aggregation final : public rolling_aggregation, public groupby_scan_aggregation { + public: + percent_rank_aggregation() : aggregation{PERCENT_RANK} {} + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Derived aggregation class for specifying COLLECT_LIST aggregation */ @@ -1243,6 +1262,12 @@ struct target_type_impl { using type = size_type; }; +// Always use double for PERCENT_RANK +template +struct target_type_impl { + using type = double; +}; + // Always use list for COLLECT_LIST template struct target_type_impl { @@ -1405,6 +1430,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind return f.template operator()(std::forward(args)...); case aggregation::DENSE_RANK: return f.template operator()(std::forward(args)...); + case aggregation::PERCENT_RANK: + return f.template operator()(std::forward(args)...); case aggregation::COLLECT_LIST: return f.template operator()(std::forward(args)...); case aggregation::COLLECT_SET: diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index 8e3db1c7b10..36dce6caf0b 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -102,5 +102,17 @@ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Generate row percent ranks for a column. + * + * @param order_by Input column to generate ranks for. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return rank values. 
+ */ +std::unique_ptr inclusive_percent_rank_scan(column_view const& order_by, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index a59ad4c42ee..d31f85d6603 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -129,6 +129,8 @@ struct DeviceMin { !cudf::is_fixed_point()>* = nullptr> static constexpr T identity() { + // chrono types do not have std::numeric_limits specializations and should use T::max() + // https://eel.is/c++draft/numeric.limits.general#6 if constexpr (cudf::is_chrono()) return T::max(); return cuda::std::numeric_limits::max(); } @@ -171,6 +173,8 @@ struct DeviceMax { !cudf::is_fixed_point()>* = nullptr> static constexpr T identity() { + // chrono types do not have std::numeric_limits specializations and should use T::min() + // https://eel.is/c++draft/numeric.limits.general#6 if constexpr (cudf::is_chrono()) return T::min(); return cuda::std::numeric_limits::lowest(); } diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 51d58383de4..4ec5f134bb0 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -130,7 +131,7 @@ struct MurmurHash3_32 { * * @returns A hash value that intelligently combines the lhs and rhs hash values */ - [[nodiscard]] __device__ inline result_type hash_combine(result_type lhs, result_type rhs) + constexpr result_type hash_combine(result_type lhs, result_type rhs) const { result_type combined{lhs}; diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index cce05042917..e4e94074fb8 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ #include #include -#include namespace cudf { /** @@ -26,6 +25,22 @@ namespace cudf { * @file */ +/** + * @brief Identifies the hash function to be used + */ +enum class hash_id { + HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed + HASH_MURMUR3, ///< Murmur3 hash function + HASH_MD5, ///< MD5 hash function + HASH_SERIAL_MURMUR3, ///< Serial Murmur3 hash function + HASH_SPARK_MURMUR3 ///< Spark Murmur3 hash function +}; + +/** + * @brief The default seed value for hash functions + */ +static constexpr uint32_t DEFAULT_HASH_SEED = 0; + /** * @brief Computes the hash value of each row in the input set of columns. 
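The hash_combine body above is cut off mid-hunk. For reference, here is a self-contained sketch of the Boost-style combine it is modeled on; treat the exact constant and shift recipe as an assumption about the implementation rather than a verbatim copy of the libcudf body.

#include <cstdint>

// Boost-style hash_combine: mixes rhs into lhs so that (a, b) and (b, a)
// hash differently. 0x9e3779b9 is the 32-bit golden-ratio constant.
constexpr std::uint32_t hash_combine(std::uint32_t lhs, std::uint32_t rhs)
{
  std::uint32_t combined{lhs};
  combined ^= rhs + 0x9e3779b9u + (combined << 6) + (combined >> 2);
  return combined;
}

static_assert(hash_combine(1u, 2u) != hash_combine(2u, 1u), "combine is order-sensitive");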
* diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index f6efea5f2bb..d56f8f0e904 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -530,7 +530,6 @@ class hash_join { * provided `output_size` is smaller than the actual output size. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param output_size Optional value which allows users to specify the exact output size. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device @@ -543,7 +542,6 @@ class hash_join { std::pair>, std::unique_ptr>> inner_join(cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, std::optional output_size = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; @@ -554,7 +552,6 @@ class hash_join { * provided `output_size` is smaller than the actual output size. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param output_size Optional value which allows users to specify the exact output size. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device @@ -567,7 +564,6 @@ class hash_join { std::pair>, std::unique_ptr>> left_join(cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, std::optional output_size = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; @@ -578,7 +574,6 @@ class hash_join { * provided `output_size` is smaller than the actual output size. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param output_size Optional value which allows users to specify the exact output size. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device @@ -591,7 +586,6 @@ class hash_join { std::pair>, std::unique_ptr>> full_join(cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, std::optional output_size = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; @@ -601,39 +595,32 @@ class hash_join { * probe table. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches * * @return The exact number of output when performing an inner join between two tables with * `build` and `probe` as the the join keys . 
*/ [[nodiscard]] std::size_t inner_join_size( - cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + cudf::table_view const& probe, rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * Returns the exact number of matches (rows) when performing a left join with the specified probe * table. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches * * @return The exact number of output when performing a left join between two tables with `build` * and `probe` as the join keys. */ [[nodiscard]] std::size_t left_join_size( - cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; + cudf::table_view const& probe, rmm::cuda_stream_view stream = rmm::cuda_stream_default) const; /** * Returns the exact number of matches (rows) when performing a full join with the specified probe * table. * * @param probe The probe table, from which the tuples are probed. - * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the intermediate table and columns' device * memory. @@ -643,7 +630,6 @@ class hash_join { */ std::size_t full_join_size( cudf::table_view const& probe, - null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index 6b1ad7db08b..3ffd9a87d39 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once -#include +#include #include diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index 3f35b796e58..ba34e20398e 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -451,7 +451,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window( * should be the exact same type (`INT32`). * * @code{.pseudo} - * Example: Consider an motor-racing statistics dataset, containing the following columns: + * Example: Consider a motor-racing statistics dataset, containing the following columns: * 1. driver_name: (STRING) Name of the car driver * 2. num_overtakes: (INT32) Number of times the driver overtook another car in a lap * 3.
lap_number: (INT32) The number of the lap diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh index aa3f544202f..7af56f89449 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include @@ -150,9 +152,11 @@ __device__ DecimalType parse_decimal(char const* iter, char const* iter_end, int exp_ten += exp_offset; // shift the output value based on the exp_ten and the scale values - value = exp_ten < scale - ? value / static_cast(exp10(static_cast(scale - exp_ten))) - : value * static_cast(exp10(static_cast(exp_ten - scale))); + auto const shift_adjust = + abs(scale - exp_ten) > cuda::std::numeric_limits::digits10 + ? cuda::std::numeric_limits::max() + : numeric::detail::exp10(abs(scale - exp_ten)); + value = exp_ten < scale ? value / shift_adjust : value * shift_adjust; return static_cast(value) * (sign == 0 ? 1 : sign); } diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index 466f71aace0..94e9f36d7d3 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -48,12 +49,14 @@ namespace strings { * * @param strings Strings instance for this operation. * @param pattern The regular expression pattern with group indicators. + * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate the returned table's device memory. * @return Columns of strings extracted from the input column. */ std::unique_ptr extract( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -69,7 +72,7 @@ std::unique_ptr
extract( * @code{.pseudo} * Example: * s = ["a1 b4", "b2", "c3 a5", "b", null] - * r = extract_all(s,"([ab])(\\d)") + * r = extract_all_record(s,"([ab])(\\d)") * r is now [ ["a", "1", "b", "4"], * ["b", "2"], * ["a", "5"], @@ -81,12 +84,14 @@ std::unique_ptr<table>
extract( * * @param strings Strings instance for this operation. * @param pattern The regular expression pattern with group indicators. + * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate any returned device memory. * @return Lists column containing strings extracted from the input column. */ -std::unique_ptr extract_all( +std::unique_ptr extract_all_record( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp new file mode 100644 index 00000000000..320d1bdc9b4 --- /dev/null +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cudf { +namespace strings { +/** + * @addtogroup strings_split + * @{ + * @file + */ + +/** + * @brief Splits strings elements into a table of strings columns + * using a regex pattern to delimit each string. + * + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of strings[row]` + * where `token` is a substring between delimiters. + * + * The number of rows in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = split_re(s, "[_ ]") + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * s2 = split_re(s, "[ _]", 1) + * s2 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc def_g", "_bc", "ab cd", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return A table of columns of strings. + */ +std::unique_ptr
split_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Splits strings elements into a table of strings columns + * using a regex pattern to delimit each string starting from the end of the string. + * + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of string[row]` + * where `token` is the substring between each delimiter. + * + * The number of rows in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = rsplit_re(s, "[_ ]") + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * s2 = rsplit_re(s, "[ _]", 1) + * s2 is a table of strings columns: + * [ ["a_bc def", "a_", "_ab", "ab"], + * ["g", "bc", "cd", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return A table of columns of strings. + */ +std::unique_ptr
rsplit_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Splits strings elements into a list column of strings + * using the given regex pattern to delimit each string. + * + * Each element generates an array of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null output row. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = split_record_re(s, "[_ ]") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = split_record_re(s, "[ _]", 1) + * s2 is a lists column of strings: + * [ ["a", "bc def_g"], + * ["a", "_bc"], + * ["", "ab cd"], + * ["ab", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return Lists column of strings. + */ +std::unique_ptr split_record_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Splits strings elements into a list column of strings + * using the given regex pattern to delimit each string starting from the end of the string. + * + * Each element generates a vector of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the separation points within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null output row. 
+ * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = rsplit_record_re(s, "[_ ]") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = rsplit_record_re(s, "[ _]", 1) + * s2 is a lists column of strings: + * [ ["a_bc def", "g"], + * ["a_", "bc"], + * ["_ab", "cd"], + * ["ab_cd", ""] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return Lists column of strings. + */ +std::unique_ptr rsplit_record_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp index 6222b2e680e..76e2589a5a9 100644 --- a/cpp/include/cudf/types.hpp +++ b/cpp/include/cudf/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -326,21 +326,5 @@ inline bool operator!=(data_type const& lhs, data_type const& rhs) { return !(lh */ std::size_t size_of(data_type t); -/** - * @brief Identifies the hash function to be used - */ -enum class hash_id { - HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed - HASH_MURMUR3, ///< Murmur3 hash function - HASH_MD5, ///< MD5 hash function - HASH_SERIAL_MURMUR3, ///< Serial Murmur3 hash function - HASH_SPARK_MURMUR3 ///< Spark Murmur3 hash function -}; - -/** - * @brief The default seed value for hash functions - */ -static constexpr uint32_t DEFAULT_HASH_SEED = 0; - /** @} */ } // namespace cudf diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 0ac41b2c4a1..bfaf4694693 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/wrappers/durations.hpp b/cpp/include/cudf/wrappers/durations.hpp index 3a27d798487..62aa22c2788 100644 --- a/cpp/include/cudf/wrappers/durations.hpp +++ b/cpp/include/cudf/wrappers/durations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
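Assuming a strings column view built elsewhere, typical calls to the new regex-split APIs declared above would look like the sketch below. The signatures are taken from the declarations in this header; the wrapper function and its argument are illustrative only.

#include <cudf/strings/split/split_re.hpp>
#include <cudf/strings/strings_column_view.hpp>

// Sketch: split each string on '_' or ' ' at most once, from the left and
// from the right, plus a full split into a table of columns.
void split_examples(cudf::strings_column_view const& scv)
{
  auto left_split  = cudf::strings::split_record_re(scv, "[_ ]", 1);   // lists column
  auto right_split = cudf::strings::rsplit_record_re(scv, "[_ ]", 1);  // lists column
  auto table_split = cudf::strings::split_re(scv, "[_ ]");             // table of string columns
}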
@@ -16,8 +16,6 @@ #pragma once -#include - #include namespace cudf { @@ -68,32 +66,3 @@ static_assert(sizeof(duration_ns) == sizeof(typename duration_ns::rep), ""); /** @} */ // end of group } // namespace cudf - -namespace std { -/** - * @brief Specialization of std::numeric_limits for cudf::detail::duration - * - * Pass through to return the limits of the underlying numeric representation. - */ -#define DURATION_LIMITS(TypeName) \ - template <> \ - struct numeric_limits { \ - static constexpr TypeName max() noexcept { return TypeName::max(); } \ - static constexpr TypeName lowest() noexcept \ - { \ - return TypeName(std::numeric_limits::lowest()); \ - } \ - static constexpr TypeName min() noexcept { return TypeName::min(); } \ - } - -DURATION_LIMITS(cudf::duration_D); -DURATION_LIMITS(cudf::duration_h); -DURATION_LIMITS(cudf::duration_m); -DURATION_LIMITS(cudf::duration_s); -DURATION_LIMITS(cudf::duration_ms); -DURATION_LIMITS(cudf::duration_us); -DURATION_LIMITS(cudf::duration_ns); - -#undef DURATION_LIMITS - -} // namespace std diff --git a/cpp/include/cudf/wrappers/timestamps.hpp b/cpp/include/cudf/wrappers/timestamps.hpp index 8481068ca05..5a4424112de 100644 --- a/cpp/include/cudf/wrappers/timestamps.hpp +++ b/cpp/include/cudf/wrappers/timestamps.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,32 +78,3 @@ static_assert(sizeof(timestamp_ns) == sizeof(typename timestamp_ns::rep), ""); /** @} */ // end of group } // namespace cudf - -namespace std { -/** - * @brief Specialization of std::numeric_limits for cudf::detail::timestamp - * - * Pass through to return the limits of the underlying numeric representation. - */ -#define TIMESTAMP_LIMITS(TypeName) \ - template <> \ - struct numeric_limits { \ - static constexpr TypeName max() noexcept { return TypeName::max(); } \ - static constexpr TypeName lowest() noexcept \ - { \ - return TypeName{TypeName::duration{std::numeric_limits::lowest()}}; \ - } \ - static constexpr TypeName min() noexcept { return TypeName::min(); } \ - } - -TIMESTAMP_LIMITS(cudf::timestamp_D); -TIMESTAMP_LIMITS(cudf::timestamp_h); -TIMESTAMP_LIMITS(cudf::timestamp_m); -TIMESTAMP_LIMITS(cudf::timestamp_s); -TIMESTAMP_LIMITS(cudf::timestamp_ms); -TIMESTAMP_LIMITS(cudf::timestamp_us); -TIMESTAMP_LIMITS(cudf::timestamp_ns); - -#undef TIMESTAMP_LIMITS - -} // namespace std diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index c00b3d6db85..8e2a167f7b2 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
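The numeric_limits specializations deleted above are no longer needed because chrono types already expose static min()/max() members, while the unspecialized std::numeric_limits primary template reports is_specialized == false, which is exactly what the DeviceMin/DeviceMax changes earlier in this diff rely on. A standalone standard-C++ sketch of the relationship:

#include <chrono>
#include <limits>

using ms = std::chrono::milliseconds;

// The primary std::numeric_limits template is not specialized for durations...
static_assert(!std::numeric_limits<ms>::is_specialized, "no numeric_limits for chrono types");
// ...but the chrono type's own static members pass through to the underlying rep.
static_assert(ms::max().count() == std::numeric_limits<ms::rep>::max(), "max passes through");
static_assert(ms::min().count() == std::numeric_limits<ms::rep>::lowest(), "min passes through");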
@@ -160,6 +160,12 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, percent_rank_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + std::vector> simple_aggregations_collector::visit( data_type col_type, collect_list_aggregation const& agg) { @@ -333,6 +339,11 @@ void aggregation_finalizer::visit(dense_rank_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(percent_rank_aggregation const& agg) +{ + visit(static_cast(agg)); +} + void aggregation_finalizer::visit(collect_list_aggregation const& agg) { visit(static_cast(agg)); @@ -616,6 +627,16 @@ template std::unique_ptr make_dense_rank_aggregation() template std::unique_ptr make_dense_rank_aggregation(); +/// Factory to create a PERCENT_RANK aggregation +template +std::unique_ptr make_percent_rank_aggregation() +{ + return std::make_unique(); +} +template std::unique_ptr make_percent_rank_aggregation(); +template std::unique_ptr +make_percent_rank_aggregation(); + /// Factory to create a COLLECT_LIST aggregation template std::unique_ptr make_collect_list_aggregation(null_policy null_handling) diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 62aa3df8e5c..eae7d0b6129 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ #include #include +#include +#include #include #include #include @@ -98,7 +100,9 @@ std::unique_ptr rank_scan(column_view const& order_by, order_by, group_labels, group_offsets, - [] __device__(bool equality, auto row_index) { return equality ? row_index + 1 : 0; }, + [] __device__(bool unequal, auto row_index_in_group) { + return unequal ? row_index_in_group + 1 : 0; + }, DeviceMax{}, has_nested_nulls(table_view{{order_by}}), stream, @@ -115,13 +119,50 @@ std::unique_ptr dense_rank_scan(column_view const& order_by, order_by, group_labels, group_offsets, - [] __device__(bool equality, auto row_index) { return equality; }, + [] __device__(bool const unequal, size_type const) { return unequal ? 1 : 0; }, DeviceSum{}, has_nested_nulls(table_view{{order_by}}), stream, mr); } +std::unique_ptr percent_rank_scan(column_view const& order_by, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const rank_column = rank_scan( + order_by, group_labels, group_offsets, stream, rmm::mr::get_current_device_resource()); + auto const rank_view = rank_column->view(); + auto const group_size_iter = cudf::detail::make_counting_transform_iterator( + 0, + [labels = group_labels.begin(), + offsets = group_offsets.begin()] __device__(size_type row_index) { + auto const group_label = labels[row_index]; + auto const group_start = offsets[group_label]; + auto const group_end = offsets[group_label + 1]; + return group_end - group_start; + }); + + // Result type for PERCENT_RANK is independent of input type. 
+ using result_type = cudf::detail::target_type_t; + + auto percent_rank_result = cudf::make_fixed_width_column( + data_type{type_to_id()}, rank_view.size(), mask_state::UNALLOCATED, stream, mr); + + thrust::transform(rmm::exec_policy(stream), + rank_view.begin(), + rank_view.end(), + group_size_iter, + percent_rank_result->mutable_view().begin(), + [] __device__(auto const rank, auto const group_size) { + return group_size == 1 ? 0.0 : ((rank - 1.0) / (group_size - 1)); + }); + + return percent_rank_result; +} + } // namespace detail } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp index 82ef0e25380..76a7f3f73c7 100644 --- a/cpp/src/groupby/sort/group_scan.hpp +++ b/cpp/src/groupby/sort/group_scan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,6 +116,21 @@ std::unique_ptr dense_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Internal API to calculate groupwise percent rank value + * + * @param order_by column or struct column by which the rows within a group are sorted + * @param group_labels ID of group to which the row belongs + * @param group_offsets group index offsets with group ID indices + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Column of type `double` of percent rank values + */ +std::unique_ptr percent_rank_scan(column_view const& order_by, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index dde4e00eb4a..8e1463f7964 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,12 +74,16 @@ struct element_arg_minmax_fn { /** * @brief Value accessor for column which supports dictionary column too. * + * This is similar to `value_accessor` in `column_device_view.cuh` but with support of dictionary + * type. + * * @tparam T Type of the underlying column. For dictionary column, type of the key column. */ template struct value_accessor { column_device_view const col; bool const is_dict; + value_accessor(column_device_view const& col) : col(col), is_dict(cudf::is_dictionary(col.type())) { } @@ -93,6 +97,7 @@ struct value_accessor { return col.element(i); } } + __device__ auto operator()(size_type i) const { return value(i); } }; @@ -100,20 +105,28 @@ struct value_accessor { * @brief Null replaced value accessor for column which supports dictionary column too. * For null value, returns null `init` value * - * @tparam T Type of the underlying column. For dictionary column, type of the key column. + * @tparam SourceType Type of the underlying column. 
For dictionary column, type of the key column. + * @tparam TargetType Type that is used for computation. */ -template -struct null_replaced_value_accessor : value_accessor { - using super_t = value_accessor; +template +struct null_replaced_value_accessor : value_accessor { + using super_t = value_accessor; + + TargetType const init; bool const has_nulls; - T const init; - null_replaced_value_accessor(column_device_view const& col, T const& init, bool const has_nulls) + + null_replaced_value_accessor(column_device_view const& col, + TargetType const& init, + bool const has_nulls) : super_t(col), init(init), has_nulls(has_nulls) { } - __device__ T operator()(size_type i) const + + __device__ TargetType operator()(size_type i) const { - return has_nulls && super_t::col.is_null_nocheck(i) ? init : super_t::value(i); + return has_nulls && super_t::col.is_null_nocheck(i) + ? init + : static_cast(super_t::value(i)); } }; @@ -168,7 +181,7 @@ struct group_reduction_functor; + using SourceDType = device_storage_type_t; using ResultType = cudf::detail::target_type_t; using ResultDType = device_storage_type_t; @@ -203,9 +216,11 @@ struct group_reduction_functor; - auto init = OpType::template identity(); + auto init = OpType::template identity(); auto inp_values = cudf::detail::make_counting_transform_iterator( - 0, null_replaced_value_accessor{*d_values_ptr, init, values.has_nulls()}); + 0, + null_replaced_value_accessor{ + *d_values_ptr, init, values.has_nulls()}); do_reduction(inp_values, result_begin, OpType{}); } diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 402ff8c47ed..8c4959da35b 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -144,6 +144,23 @@ void scan_result_functor::operator()(aggregation const& detail::dense_rank_scan( order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr)); } + +template <> +void scan_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(values, agg)) return; + CUDF_EXPECTS(helper.is_presorted(), + "Percent rank aggregate in groupby scan requires the keys to be presorted"); + auto const order_by = get_grouped_values(); + CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by), + "Unsupported list type in grouped percent_rank scan."); + + cache.add_result( + values, + agg, + detail::percent_rank_scan( + order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr)); +} } // namespace detail // Sort-based groupby diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index dabe992d959..e2893a2e881 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
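The SourceType/TargetType split in null_replaced_value_accessor above lets values be read as one type and accumulated as a wider one, e.g. INT32 inputs summed into an INT64 result without overflow. The following host-only sketch shows the same pattern with illustrative names, not libcudf's types.

#include <cstdint>
#include <iostream>

// Illustrative accessor: reads SourceType elements, substitutes `init` for
// null rows, and returns values widened to TargetType for accumulation.
template <typename SourceType, typename TargetType>
struct null_replaced_accessor {
  SourceType const* data;
  bool const* valid;
  TargetType init;  // identity of the reduction, used for null rows
  TargetType operator()(int i) const
  {
    return valid[i] ? static_cast<TargetType>(data[i]) : init;
  }
};

int main()
{
  std::int32_t data[]{2'000'000'000, 2'000'000'000, 7};
  bool valid[]{true, true, false};
  null_replaced_accessor<std::int32_t, std::int64_t> acc{data, valid, 0};
  std::int64_t sum = 0;
  for (int i = 0; i < 3; ++i) sum += acc(i);
  std::cout << sum << '\n';  // 4000000000: would overflow an int32 accumulator
}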
@@ -194,20 +194,13 @@ template > make_sliced_tasks( F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool) { + constexpr size_t default_max_slice_size = 4 * 1024 * 1024; + static auto const max_slice_size = getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_size); + auto const slices = make_file_io_slices(size, max_slice_size); std::vector> slice_tasks; - constexpr size_t default_max_slice_bytes = 4 * 1024 * 1024; - static auto const max_slice_bytes = - getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_bytes); - size_t const n_slices = util::div_rounding_up_safe(size, max_slice_bytes); - size_t slice_offset = 0; - for (size_t t = 0; t < n_slices; ++t) { - DataT* ptr_slice = ptr + slice_offset; - - size_t const slice_size = (t == n_slices - 1) ? size % max_slice_bytes : max_slice_bytes; - slice_tasks.push_back(pool.submit(function, ptr_slice, slice_size, offset + slice_offset)); - - slice_offset += slice_size; - } + std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) { + return pool.submit(function, ptr + slice.offset, slice.size, offset + slice.offset); + }); return slice_tasks; } @@ -318,6 +311,21 @@ std::unique_ptr make_cufile_output(std::string const& filepa return nullptr; } +std::vector make_file_io_slices(size_t size, size_t max_slice_size) +{ + max_slice_size = std::max(1024ul, max_slice_size); + auto const n_slices = util::div_rounding_up_safe(size, max_slice_size); + std::vector slices; + slices.reserve(n_slices); + std::generate_n(std::back_inserter(slices), n_slices, [&, idx = 0]() mutable { + auto const slice_offset = idx++ * max_slice_size; + auto const slice_size = std::min(size - slice_offset, max_slice_size); + return file_io_slice{slice_offset, slice_size}; + }); + + return slices; +} + } // namespace detail } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index fcee4e43a20..be3ecc49ab0 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -291,6 +291,21 @@ std::unique_ptr make_cufile_input(std::string const& filepath */ std::unique_ptr make_cufile_output(std::string const& filepath); +/** + * @brief Byte range to be read/written in a single operation. + */ +struct file_io_slice { + size_t offset; + size_t size; +}; + +/** + * @brief Split the total number of bytes to read/write into slices to enable parallel IO. + * + * If `max_slice_size` is below 1024, 1024 will be used instead to prevent potential misuse. + */ +std::vector make_file_io_slices(size_t size, size_t max_slice_size); + } // namespace detail } // namespace io } // namespace cudf diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 7590c93f0c3..b89bcabf23e 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
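A worked example of the slicing logic introduced above: with the default 4 MiB maximum slice size, a 10 MiB request becomes three slices, {0, 4 MiB}, {4 MiB, 4 MiB}, and {8 MiB, 2 MiB}. The sketch below restates make_file_io_slices outside libcudf so it compiles standalone; util::div_rounding_up_safe is replaced by plain ceiling division.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <vector>

struct file_io_slice {
  std::size_t offset;
  std::size_t size;
};

// Standalone restatement of the function in the diff above: clamp the slice
// size to at least 1024 bytes, then emit ceil(size / max_slice_size) slices.
std::vector<file_io_slice> make_file_io_slices(std::size_t size, std::size_t max_slice_size)
{
  max_slice_size      = std::max<std::size_t>(1024, max_slice_size);
  auto const n_slices = (size + max_slice_size - 1) / max_slice_size;
  std::vector<file_io_slice> slices;
  slices.reserve(n_slices);
  std::generate_n(std::back_inserter(slices), n_slices, [&, idx = std::size_t{0}]() mutable {
    auto const slice_offset = idx++ * max_slice_size;
    return file_io_slice{slice_offset, std::min(size - slice_offset, max_slice_size)};
  });
  return slices;
}

int main()
{
  for (auto const& s : make_file_io_slices(10 << 20, 4 << 20))
    std::cout << s.offset << " + " << s.size << '\n';  // 0+4M, 4M+4M, 8M+2M
}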
@@ -140,8 +140,8 @@ probe_join_hash_table(cudf::table_device_view build_table, std::size_t get_full_join_size(cudf::table_device_view build_table, cudf::table_device_view probe_table, multimap_type const& hash_table, - bool has_nulls, - null_equality compare_nulls, + bool const has_nulls, + null_equality const compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -235,6 +235,7 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) : _is_empty{build.num_rows() == 0}, + _nulls_equal{compare_nulls}, _hash_table{compute_hash_table_size(build.num_rows()), std::numeric_limits::max(), cudf::detail::JoinNoneValue, @@ -253,50 +254,43 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, if (_is_empty) { return; } - build_join_hash_table(_build, _hash_table, compare_nulls, stream); + cudf::detail::build_join_hash_table(_build, _hash_table, _nulls_equal, stream); } std::pair>, std::unique_ptr>> hash_join::hash_join_impl::inner_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, compare_nulls, output_size, stream, mr); + return compute_hash_join(probe, output_size, stream, mr); } std::pair>, std::unique_ptr>> hash_join::hash_join_impl::left_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, compare_nulls, output_size, stream, mr); + return compute_hash_join(probe, output_size, stream, mr); } std::pair>, std::unique_ptr>> hash_join::hash_join_impl::full_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); - return compute_hash_join( - probe, compare_nulls, output_size, stream, mr); + return compute_hash_join(probe, output_size, stream, mr); } std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const { CUDF_FUNC_RANGE(); @@ -316,12 +310,11 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& p *flattened_probe_table_ptr, _hash_table, cudf::has_nulls(flattened_probe_table) | cudf::has_nulls(_build), - compare_nulls, + _nulls_equal, stream); } std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const { CUDF_FUNC_RANGE(); @@ -341,12 +334,11 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& pr *flattened_probe_table_ptr, _hash_table, cudf::has_nulls(flattened_probe_table) | cudf::has_nulls(_build), - compare_nulls, + _nulls_equal, stream); } std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { @@ -362,20 +354,20 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& pr auto build_table_ptr = cudf::table_device_view::create(_build, stream); auto flattened_probe_table_ptr = cudf::table_device_view::create(flattened_probe_table, stream); - return get_full_join_size(*build_table_ptr, - 
*flattened_probe_table_ptr, - _hash_table, - cudf::has_nulls(flattened_probe_table) | cudf::has_nulls(_build), - compare_nulls, - stream, - mr); + return cudf::detail::get_full_join_size( + *build_table_ptr, + *flattened_probe_table_ptr, + _hash_table, + cudf::has_nulls(flattened_probe_table) | cudf::has_nulls(_build), + _nulls_equal, + stream, + mr); } template std::pair>, std::unique_ptr>> hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -403,42 +395,40 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, [](const auto& b, const auto& p) { return b.type() == p.type(); }), "Mismatch in joining column data types"); - return probe_join_indices( - flattened_probe_table, compare_nulls, output_size, stream, mr); + return probe_join_indices(flattened_probe_table, output_size, stream, mr); } template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe, - null_equality compare_nulls, +hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe_table, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { // Trivial left join case - exit early if (_is_empty and JoinKind != cudf::detail::join_kind::INNER_JOIN) { - return get_trivial_left_join_indices(probe, stream, mr); + return get_trivial_left_join_indices(probe_table, stream, mr); } CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null."); auto build_table_ptr = cudf::table_device_view::create(_build, stream); - auto probe_table_ptr = cudf::table_device_view::create(probe, stream); - - auto join_indices = - cudf::detail::probe_join_hash_table(*build_table_ptr, - *probe_table_ptr, - _hash_table, - cudf::has_nulls(probe) | cudf::has_nulls(_build), - compare_nulls, - output_size, - stream, - mr); + auto probe_table_ptr = cudf::table_device_view::create(probe_table, stream); + + auto join_indices = cudf::detail::probe_join_hash_table( + *build_table_ptr, + *probe_table_ptr, + _hash_table, + cudf::has_nulls(probe_table) | cudf::has_nulls(_build), + _nulls_equal, + output_size, + stream, + mr); if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, probe.num_rows(), _build.num_rows(), stream, mr); + join_indices.second, probe_table.num_rows(), _build.num_rows(), stream, mr); join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); } return join_indices; diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 21bfd8120f7..9c44aeebd59 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,7 +89,7 @@ class make_pair_function { * @param probe_table The left hand table * @param hash_table A hash table built on the build table that maps the index * of every row to the hash value of that row. - * @param compare_nulls Controls whether null join-key values should match or not. + * @param nulls_equal Flag to denote nulls are equal or not. 
* @param stream CUDA stream used for device memory operations and kernel launches * * @return The exact size of the output of the join operation @@ -98,8 +98,8 @@ template std::size_t compute_join_output_size(table_device_view build_table, table_device_view probe_table, multimap_type const& hash_table, - bool has_nulls, - null_equality compare_nulls, + bool const has_nulls, + cudf::null_equality const nulls_equal, rmm::cuda_stream_view stream) { const size_type build_table_num_rows{build_table.num_rows()}; @@ -121,7 +121,7 @@ std::size_t compute_join_output_size(table_device_view build_table, } auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls}; - pair_equality equality{probe_table, build_table, probe_nulls, compare_nulls}; + pair_equality equality{probe_table, build_table, probe_nulls, nulls_equal}; row_hash hash_probe{probe_nulls, probe_table}; auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); @@ -152,14 +152,14 @@ std::unique_ptr combine_table_pair(std::unique_ptr&& l * * @param build Table of columns used to build join hash. * @param hash_table Build hash table. - * @param compare_nulls Controls whether null join-key values should match or not. + * @param nulls_equal Flag to denote nulls are equal or not. * @param stream CUDA stream used for device memory operations and kernel launches. * */ template void build_join_hash_table(cudf::table_view const& build, MultimapType& hash_table, - null_equality compare_nulls, + null_equality const nulls_equal, rmm::cuda_stream_view stream) { auto build_table_ptr = cudf::table_device_view::create(build, stream); @@ -174,7 +174,7 @@ void build_join_hash_table(cudf::table_view const& build, auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); size_type const build_table_num_rows{build_table_ptr->num_rows()}; - if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { + if (nulls_equal == cudf::null_equality::EQUAL or (not nullable(build))) { hash_table.insert(iter, iter + build_table_num_rows, stream.value()); } else { thrust::counting_iterator stencil(0); @@ -197,7 +197,8 @@ struct hash_join::hash_join_impl { hash_join_impl& operator=(hash_join_impl&&) = delete; private: - bool _is_empty; + bool const _is_empty; + cudf::null_equality const _nulls_equal; cudf::table_view _build; std::vector> _created_null_columns; cudf::structs::detail::flattened_table _flattened_build_table; @@ -221,7 +222,6 @@ struct hash_join::hash_join_impl { std::pair>, std::unique_ptr>> inner_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -229,7 +229,6 @@ struct hash_join::hash_join_impl { std::pair>, std::unique_ptr>> left_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -237,21 +236,17 @@ struct hash_join::hash_join_impl { std::pair>, std::unique_ptr>> full_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; [[nodiscard]] std::size_t inner_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const; [[nodiscard]] std::size_t left_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const; std::size_t full_join_size(cudf::table_view const& probe, - null_equality 
compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -260,7 +255,6 @@ struct hash_join::hash_join_impl { std::pair>, std::unique_ptr>> compute_hash_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; @@ -276,7 +270,6 @@ struct hash_join::hash_join_impl { * @tparam JoinKind The type of join to be performed. * * @param probe_table Table of probe side columns to join. - * @param compare_nulls Controls whether null join-key values should match or not. * @param output_size Optional value which allows users to specify the exact output size. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned vectors. @@ -286,8 +279,7 @@ struct hash_join::hash_join_impl { template std::pair>, std::unique_ptr>> - probe_join_indices(cudf::table_view const& probe, - null_equality compare_nulls, + probe_join_indices(cudf::table_view const& probe_table, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const; diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index db79075d864..7a478ca2eb3 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include +#include "join/hash_join.cuh" +#include "join/join_common_utils.hpp" #include #include @@ -51,11 +51,11 @@ inner_join(table_view const& left_input, // build the hash map from the smaller table. if (right.num_rows() > left.num_rows()) { cudf::hash_join hj_obj(left, compare_nulls, stream); - auto result = hj_obj.inner_join(right, compare_nulls, std::nullopt, stream, mr); - return std::make_pair(std::move(result.second), std::move(result.first)); + auto [right_result, left_result] = hj_obj.inner_join(right, std::nullopt, stream, mr); + return std::make_pair(std::move(left_result), std::move(right_result)); } else { cudf::hash_join hj_obj(right, compare_nulls, stream); - return hj_obj.inner_join(left, compare_nulls, std::nullopt, stream, mr); + return hj_obj.inner_join(left, std::nullopt, stream, mr); } } @@ -78,16 +78,17 @@ std::unique_ptr
inner_join(table_view const& left_input,
 
   auto const left  = scatter_columns(matched.second.front(), left_on, left_input);
   auto const right = scatter_columns(matched.second.back(), right_on, right_input);
 
-  auto join_indices = inner_join(left.select(left_on), right.select(right_on), compare_nulls, mr);
+  auto const [left_join_indices, right_join_indices] = cudf::detail::inner_join(
+    left.select(left_on), right.select(right_on), compare_nulls, stream, mr);
 
   std::unique_ptr<table> left_result  = detail::gather(left,
-                                                       join_indices.first->begin(),
-                                                       join_indices.first->end(),
+                                                       left_join_indices->begin(),
+                                                       left_join_indices->end(),
                                                        out_of_bounds_policy::DONT_CHECK,
                                                        stream,
                                                        mr);
   std::unique_ptr<table> right_result = detail::gather(right,
-                                                       join_indices.second->begin(),
-                                                       join_indices.second->end(),
+                                                       right_join_indices->begin(),
+                                                       right_join_indices->end(),
                                                        out_of_bounds_policy::DONT_CHECK,
                                                        stream,
                                                        mr);
@@ -113,7 +114,7 @@ left_join(table_view const& left_input,
   table_view const right = matched.second.back();
 
   cudf::hash_join hj_obj(right, compare_nulls, stream);
-  return hj_obj.left_join(left, compare_nulls, std::nullopt, stream, mr);
+  return hj_obj.left_join(left, std::nullopt, stream, mr);
 }
 
 std::unique_ptr<table>
 left_join(table_view const& left_input,
@@ -134,23 +135,24 @@ std::unique_ptr<table> left_join(table_view const& left_input,
   table_view const left  = scatter_columns(matched.second.front(), left_on, left_input);
   table_view const right = scatter_columns(matched.second.back(), right_on, right_input);
 
-  auto join_indices = left_join(left.select(left_on), right.select(right_on), compare_nulls);
-
-  if ((left_on.empty() || right_on.empty()) ||
-      is_trivial_join(left, right, cudf::detail::join_kind::LEFT_JOIN)) {
-    auto probe_build_pair = get_empty_joined_table(left, right);
-    return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
-                                            std::move(probe_build_pair.second));
+  if ((left_on.empty() or right_on.empty()) or
+      cudf::detail::is_trivial_join(left, right, cudf::detail::join_kind::LEFT_JOIN)) {
+    auto [left_empty_table, right_empty_table] = get_empty_joined_table(left, right);
+    return cudf::detail::combine_table_pair(std::move(left_empty_table),
+                                            std::move(right_empty_table));
   }
+
+  auto const [left_join_indices, right_join_indices] = cudf::detail::left_join(
+    left.select(left_on), right.select(right_on), compare_nulls, stream, mr);
 
   std::unique_ptr<table> left_result  = detail::gather(left,
-                                                       join_indices.first->begin(),
-                                                       join_indices.first->end(),
+                                                       left_join_indices->begin(),
+                                                       left_join_indices->end(),
                                                        out_of_bounds_policy::NULLIFY,
                                                        stream,
                                                        mr);
   std::unique_ptr<table> right_result = detail::gather(right,
-                                                       join_indices.second->begin(),
-                                                       join_indices.second->end(),
+                                                       right_join_indices->begin(),
+                                                       right_join_indices->end(),
                                                        out_of_bounds_policy::NULLIFY,
                                                        stream,
                                                        mr);
@@ -176,7 +178,7 @@ full_join(table_view const& left_input,
   table_view const right = matched.second.back();
 
   cudf::hash_join hj_obj(right, compare_nulls, stream);
-  return hj_obj.full_join(left, compare_nulls, std::nullopt, stream, mr);
+  return hj_obj.full_join(left, std::nullopt, stream, mr);
 }
 
 std::unique_ptr<table>
 full_join(table_view const& left_input,
@@ -197,23 +199,24 @@ std::unique_ptr<table> full_join(table_view const& left_input,
   table_view const left  = scatter_columns(matched.second.front(), left_on, left_input);
   table_view const right = scatter_columns(matched.second.back(), right_on, right_input);
 
-  auto join_indices = full_join(left.select(left_on), right.select(right_on), compare_nulls);
-
-  if ((left_on.empty() || right_on.empty()) ||
-      is_trivial_join(left, right, cudf::detail::join_kind::FULL_JOIN)) {
-    auto probe_build_pair = get_empty_joined_table(left, right);
-    return cudf::detail::combine_table_pair(std::move(probe_build_pair.first),
-                                            std::move(probe_build_pair.second));
+  if ((left_on.empty() or right_on.empty()) or
+      cudf::detail::is_trivial_join(left, right, cudf::detail::join_kind::FULL_JOIN)) {
+    auto [left_empty_table, right_empty_table] = get_empty_joined_table(left, right);
+    return cudf::detail::combine_table_pair(std::move(left_empty_table),
+                                            std::move(right_empty_table));
   }
+
+  auto const [left_join_indices, right_join_indices] = cudf::detail::full_join(
+    left.select(left_on), right.select(right_on), compare_nulls, stream, mr);
 
   std::unique_ptr<table> left_result  = detail::gather(left,
-                                                       join_indices.first->begin(),
-                                                       join_indices.first->end(),
+                                                       left_join_indices->begin(),
+                                                       left_join_indices->end(),
                                                        out_of_bounds_policy::NULLIFY,
                                                        stream,
                                                        mr);
   std::unique_ptr<table>
right_result = detail::gather(right, - join_indices.second->begin(), - join_indices.second->end(), + right_join_indices->begin(), + right_join_indices->end(), out_of_bounds_policy::NULLIFY, stream, mr); @@ -234,56 +237,50 @@ hash_join::hash_join(cudf::table_view const& build, std::pair>, std::unique_ptr>> hash_join::inner_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->inner_join(probe, compare_nulls, output_size, stream, mr); + return impl->inner_join(probe, output_size, stream, mr); } std::pair>, std::unique_ptr>> hash_join::left_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->left_join(probe, compare_nulls, output_size, stream, mr); + return impl->left_join(probe, output_size, stream, mr); } std::pair>, std::unique_ptr>> hash_join::full_join(cudf::table_view const& probe, - null_equality compare_nulls, std::optional output_size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->full_join(probe, compare_nulls, output_size, stream, mr); + return impl->full_join(probe, output_size, stream, mr); } std::size_t hash_join::inner_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const { - return impl->inner_join_size(probe, compare_nulls, stream); + return impl->inner_join_size(probe, stream); } std::size_t hash_join::left_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream) const { - return impl->left_join_size(probe, compare_nulls, stream); + return impl->left_join_size(probe, stream); } std::size_t hash_join::full_join_size(cudf::table_view const& probe, - null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - return impl->full_join_size(probe, compare_nulls, stream, mr); + return impl->full_join_size(probe, stream, mr); } // external APIs diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu index 9ac4db3a34b..464a8688a2d 100644 --- a/cpp/src/reductions/scan/rank_scan.cu +++ b/cpp/src/reductions/scan/rank_scan.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -90,7 +91,7 @@ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, return rank_generator( order_by, has_nested_nulls(table_view{{order_by}}), - [] __device__(bool equality, auto row_index) { return equality; }, + [] __device__(bool const unequal, size_type const) { return unequal ? 1 : 0; }, DeviceSum{}, stream, mr); @@ -105,11 +106,34 @@ std::unique_ptr inclusive_rank_scan(column_view const& order_by, return rank_generator( order_by, has_nested_nulls(table_view{{order_by}}), - [] __device__(bool equality, auto row_index) { return equality ? row_index + 1 : 0; }, + [] __device__(bool unequal, auto row_index) { return unequal ? 
row_index + 1 : 0; }, DeviceMax{}, stream, mr); } +std::unique_ptr inclusive_percent_rank_scan(column_view const& order_by, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const rank_column = + inclusive_rank_scan(order_by, stream, rmm::mr::get_current_device_resource()); + auto const rank_view = rank_column->view(); + + // Result type for PERCENT_RANK is independent of input type. + using result_type = cudf::detail::target_type_t; + auto percent_rank_result = cudf::make_fixed_width_column( + data_type{type_to_id()}, rank_view.size(), mask_state::UNALLOCATED, stream, mr); + + thrust::transform(rmm::exec_policy(stream), + rank_view.begin(), + rank_view.end(), + percent_rank_result->mutable_view().begin(), + [n_rows = rank_view.size()] __device__(auto const rank) { + return n_rows == 1 ? 0.0 : ((rank - 1.0) / (n_rows - 1)); + }); + return percent_rank_result; +} + } // namespace detail } // namespace cudf diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index 5f10707d7fb..d73fc862380 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,14 +34,19 @@ std::unique_ptr scan(column_view const& input, if (agg->kind == aggregation::RANK) { CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, - "Unsupported rank aggregation operator for exclusive scan"); + "Rank aggregation operator requires an inclusive scan"); return inclusive_rank_scan(input, rmm::cuda_stream_default, mr); } if (agg->kind == aggregation::DENSE_RANK) { CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, - "Unsupported dense rank aggregation operator for exclusive scan"); + "Dense rank aggregation operator requires an inclusive scan"); return inclusive_dense_rank_scan(input, rmm::cuda_stream_default, mr); } + if (agg->kind == aggregation::PERCENT_RANK) { + CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, + "Percent rank aggregation operator requires an inclusive scan"); + return inclusive_percent_rank_scan(input, rmm::cuda_stream_default, mr); + } return inclusive == scan_type::EXCLUSIVE ? detail::scan_exclusive(input, agg, null_handling, rmm::cuda_stream_default, mr) diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index c4076dd61c1..a67af9442f0 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -83,6 +83,7 @@ struct extract_fn { std::unique_ptr
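// A host-side sketch of the percent-rank transform introduced above (assumed
// simplified to plain vectors): percent = (rank - 1) / (n_rows - 1), pinned to
// 0.0 for a single-row input where the denominator would otherwise be zero.
#include <cstddef>
#include <vector>

std::vector<double> percent_rank(std::vector<int> const& ranks)
{
  auto const n = ranks.size();
  std::vector<double> out(n);
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = (n == 1) ? 0.0 : (ranks[i] - 1.0) / (n - 1);
  }
  return out;  // e.g. ranks {1, 2, 2, 4} -> {0.0, 1.0/3, 1.0/3, 1.0}
}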
extract( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -91,7 +92,8 @@ std::unique_ptr
extract( auto const d_strings = *strings_column; // compile regex into device object - auto prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto prog = + reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); auto d_prog = *prog; // extract should include groups auto const groups = d_prog.group_counts(); @@ -150,10 +152,11 @@ std::unique_ptr
extract(
 
 std::unique_ptr<table>
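// Hedged usage sketch of the widened signature (pattern and flags are
// illustrative): MULTILINE makes '^' anchor at line starts within each string,
// and regex_flags::DEFAULT preserves the previous behavior.
void extract_example(cudf::strings_column_view const& input,
                     rmm::mr::device_memory_resource* mr)
{
  auto first_number_per_line =
    cudf::strings::extract(input, "^(\\d+)", cudf::strings::regex_flags::MULTILINE, mr);
}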
extract(strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract(strings, pattern, rmm::cuda_stream_default, mr); + return detail::extract(strings, pattern, flags, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index c4749eae003..e27dccb9338 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -89,13 +89,14 @@ struct extract_fn { } // namespace /** - * @copydoc cudf::strings::extract_all + * @copydoc cudf::strings::extract_all_record * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr extract_all( +std::unique_ptr extract_all_record( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -103,7 +104,8 @@ std::unique_ptr extract_all( auto const d_strings = column_device_view::create(strings.parent(), stream); // Compile regex into device object. - auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); // The extract pattern should always include groups. auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "extract_all requires group indicators in the regex pattern."); @@ -179,12 +181,13 @@ std::unique_ptr extract_all( // external API -std::unique_ptr extract_all(strings_column_view const& strings, - std::string const& pattern, - rmm::mr::device_memory_resource* mr) +std::unique_ptr extract_all_record(strings_column_view const& strings, + std::string const& pattern, + regex_flags const flags, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_all(strings, pattern, rmm::cuda_stream_default, mr); + return detail::extract_all_record(strings, pattern, flags, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 065c358d08b..6f36658523b 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -280,15 +280,15 @@ class regex_parser { yy = 0; if (a >= '0' && a <= '9') yy += (a - '0') << 4; - else if (a > 'a' && a <= 'f') + else if (a >= 'a' && a <= 'f') yy += (a - 'a' + 10) << 4; - else if (a > 'A' && a <= 'F') + else if (a >= 'A' && a <= 'F') yy += (a - 'A' + 10) << 4; if (b >= '0' && b <= '9') yy += b - '0'; - else if (b > 'a' && b <= 'f') + else if (b >= 'a' && b <= 'f') yy += b - 'a' + 10; - else if (b > 'A' && b <= 'F') + else if (b >= 'A' && b <= 'F') yy += b - 'A' + 10; break; } diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu new file mode 100644 index 00000000000..d80148f2fe6 --- /dev/null +++ b/cpp/src/strings/split/split_re.cu @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +using string_index_pair = thrust::pair; + +enum class split_direction { + FORWARD, ///< for split logic + BACKWARD ///< for rsplit logic +}; + +/** + * @brief Identify the tokens from the `idx'th` string element of `d_strings`. + * + * Each string's tokens are stored in the `d_tokens` vector. + * The `d_token_offsets` specifies the output position within `d_tokens` + * for each string. + */ +template +struct token_reader_fn { + column_device_view const d_strings; + reprog_device prog; + split_direction const direction; + offset_type const* d_token_offsets; + string_index_pair* d_tokens; + + __device__ void operator()(size_type idx) + { + if (d_strings.is_null(idx)) { return; } + auto const d_str = d_strings.element(idx); + + auto const token_offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - token_offset; + auto const d_result = d_tokens + token_offset; // store tokens here + + size_type token_idx = 0; + size_type begin = 0; // characters + size_type end = d_str.length(); + size_type last_pos = 0; // bytes + while (prog.find(idx, d_str, begin, end) > 0) { + // get the token (characters just before this match) + auto const token = + string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos}; + // store it if we have space + if (token_idx < token_count - 1) { + d_result[token_idx++] = token; + } else { + if (direction == split_direction::FORWARD) { break; } // we are done + for (auto l = 0; l < token_idx - 1; ++l) { + d_result[l] = d_result[l + 1]; // shift left + } + d_result[token_idx - 1] = token; + } + // setup for next match + last_pos = d_str.byte_offset(end); + begin = end + (begin == end); + end = d_str.length(); + } + + // set the last token to the remainder of the string + d_result[token_idx] = string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos}; + + if (direction == split_direction::BACKWARD) { + // update first entry -- this happens when max_tokens is hit before the end of the string + auto const first_offset = + d_result[0].first + ? static_cast(thrust::distance(d_str.data(), d_result[0].first)) + : 0; + if (first_offset) { + d_result[0] = string_index_pair{d_str.data(), first_offset + d_result[0].second}; + } + } + } +}; + +/** + * @brief Call regex to split each input string into tokens. + * + * This will also convert the `offsets` values from counts to offsets. + * + * @param d_strings Strings to split + * @param d_prog Regex to evaluate against each string + * @param direction Whether tokens are generated forwards or backwards. + * @param max_tokens The maximum number of tokens for each split. + * @param offsets The number of matches on input. + * The offsets for each token in each string on output. + * @param stream CUDA stream used for kernel launches. 
+ */ +rmm::device_uvector generate_tokens(column_device_view const& d_strings, + reprog_device& d_prog, + split_direction direction, + size_type maxsplit, + mutable_column_view& offsets, + rmm::cuda_stream_view stream) +{ + auto const strings_count = d_strings.size(); + + auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); + + auto const begin = thrust::make_counting_iterator(0); + auto const end = thrust::make_counting_iterator(strings_count); + auto const d_offsets = offsets.data(); + + // convert match counts to token offsets + auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) { + return d_strings.is_null(idx) ? 0 : std::min(d_offsets[idx], max_tokens) + 1; + }; + thrust::transform_exclusive_scan( + rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus{}); + + // the last offset entry is the total number of tokens to be generated + auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); + + // generate tokens for each string + rmm::device_uvector tokens(total_tokens, stream); + auto const regex_insts = d_prog.insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else if (regex_insts <= RX_LARGE_INSTS) { + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else { + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } + + return tokens; +} + +/** + * @brief Returns string pair for the specified column for each string in `d_strings` + * + * This is used to build the table result of a split. + * Null is returned if the row is null or if the `column_index` is larger + * than the token count for that string. + */ +struct tokens_transform_fn { + column_device_view const d_strings; + string_index_pair const* d_tokens; + offset_type const* d_token_offsets; + size_type const column_index; + + __device__ string_index_pair operator()(size_type idx) const + { + auto const offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - offset; + return (column_index >= token_count) || d_strings.is_null(idx) + ? string_index_pair{nullptr, 0} + : d_tokens[offset + column_index]; + } +}; + +std::unique_ptr
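// A host-side sketch of the counts-to-offsets step in generate_tokens above
// (names and values hypothetical): each row contributes
// min(match_count, max_tokens) + 1 tokens, and an exclusive scan turns those
// sizes into offsets; the device code fuses both steps with
// thrust::transform_exclusive_scan.
#include <algorithm>
#include <numeric>
#include <vector>

std::vector<int> token_offsets(std::vector<int> const& match_counts, int max_tokens)
{
  std::vector<int> offsets(match_counts.size() + 1, 0);
  std::transform(match_counts.begin(), match_counts.end(), offsets.begin(),
                 [max_tokens](int c) { return std::min(c, max_tokens) + 1; });
  std::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin(), 0);
  // e.g. counts {2, 0, 1} with max_tokens 10 -> sizes {3, 1, 2}
  //      -> offsets {0, 3, 4, 6}; the trailing 6 is the total token count
  return offsets;
}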
+split_re(strings_column_view const& input,
+         std::string const& pattern,
+         split_direction direction,
+         size_type maxsplit,
+         rmm::cuda_stream_view stream,
+         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
+
+  auto const strings_count = input.size();
+
+  std::vector<std::unique_ptr<column>> results;
+  if (strings_count == 0) {
+    results.push_back(make_empty_column(type_id::STRING));
+    return std::make_unique<table>
(std::move(results)); + } + + // create the regex device prog from the given pattern + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + // count the number of delimiters matched in each string + auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); + auto offsets_view = offsets->mutable_view(); + auto d_offsets = offsets_view.data(); + + // get the split tokens from the input column; this also converts the counts into offsets + auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream); + + // the output column count is the maximum number of tokens generated for any input string + auto const columns_count = thrust::transform_reduce( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + [d_offsets] __device__(auto const idx) -> size_type { + return d_offsets[idx + 1] - d_offsets[idx]; + }, + 0, + thrust::maximum{}); + + // boundary case: if no columns, return one all-null column (custrings issue #119) + if (columns_count == 0) { + results.push_back(std::make_unique( + data_type{type_id::STRING}, + strings_count, + rmm::device_buffer{0, stream, mr}, // no data + cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), + strings_count)); + return std::make_unique
(std::move(results)); + } + + // convert the tokens into multiple strings columns + auto make_strings_lambda = [&](size_type column_index) { + // returns appropriate token for each row/column + auto indices_itr = cudf::detail::make_counting_transform_iterator( + 0, tokens_transform_fn{*d_strings, tokens.data(), d_offsets, column_index}); + return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr); + }; + // build a vector of columns + results.resize(columns_count); + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(columns_count), + results.begin(), + make_strings_lambda); + + return std::make_unique
(std::move(results)); +} + +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + split_direction direction, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + + auto const strings_count = input.size(); + + // create the regex device prog from the given pattern + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + // count the number of delimiters matched in each string + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto offsets_view = offsets->mutable_view(); + + // get the split tokens from the input column; this also converts the counts into offsets + auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream); + + // convert the tokens into one big strings column + auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); + + // create a lists column using the offsets and the strings columns + return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} + +} // namespace + +std::unique_ptr
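// Hedged usage sketch of the two entry points (inputs illustrative): split_re
// widens each row into table columns, padding short rows with nulls, while
// split_record_re keeps one list of tokens per row. A non-positive maxsplit
// means "no limit", per generate_tokens above.
void split_re_example(cudf::strings_column_view const& input,
                      rmm::mr::device_memory_resource* mr)
{
  auto table_result = cudf::strings::split_re(input, "[0-9]", -1, mr);
  auto lists_result = cudf::strings::split_record_re(input, "[0-9]", -1, mr);
  // e.g. rows {"a1b2c", "d3e"} -> columns {"a","d"}, {"b","e"}, {"c",null}
  //                            -> lists   [["a","b","c"], ["d","e"]]
}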
split_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); +} + +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_record_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); +} + +std::unique_ptr
rsplit_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); +} + +std::unique_ptr rsplit_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_record_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); +} + +} // namespace detail + +// external APIs + +std::unique_ptr
split_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + +std::unique_ptr
rsplit_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + +std::unique_ptr rsplit_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 913761ecd03..f96edd3ce5a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -23,7 +23,14 @@ function(ConfigureTest CMAKE_TEST_NAME) ${CMAKE_TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + # For std:: support of __int128_t. Can be removed once using cuda::std + CXX_EXTENSIONS ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) + target_link_libraries(${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main) add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) install( @@ -41,8 +48,13 @@ endfunction() # ################################################################################################## # * column tests ---------------------------------------------------------------------------------- ConfigureTest( - COLUMN_TEST column/bit_cast_test.cpp column/column_view_shallow_test.cpp column/column_test.cu - column/column_device_view_test.cu column/compound_test.cu + COLUMN_TEST + column/bit_cast_test.cpp + column/column_device_view_test.cu + column/column_test.cu + column/column_view_device_span_test.cpp + column/column_view_shallow_test.cpp + column/compound_test.cu ) # ################################################################################################## @@ -199,6 +211,7 @@ ConfigureTest( ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp) ConfigureTest(CSV_TEST io/csv_test.cpp) +ConfigureTest(FILE_IO_TEST io/file_io_test.cpp) ConfigureTest(ORC_TEST io/orc_test.cpp) ConfigureTest(PARQUET_TEST io/parquet_test.cpp) ConfigureTest(JSON_TEST io/json_test.cpp) diff --git a/cpp/tests/column/column_view_device_span_test.cpp b/cpp/tests/column/column_view_device_span_test.cpp new file mode 100644 index 00000000000..2b7ea3b3650 --- /dev/null +++ b/cpp/tests/column/column_view_device_span_test.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +template () or cudf::is_chrono())> +std::unique_ptr example_column() +{ + auto begin = thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + return cudf::test::fixed_width_column_wrapper(begin, end).release(); +} + +template +struct ColumnViewDeviceSpanTests : public cudf::test::BaseFixture { +}; + +using DeviceSpanTypes = cudf::test::FixedWidthTypesWithoutFixedPoint; +TYPED_TEST_SUITE(ColumnViewDeviceSpanTests, DeviceSpanTypes); + +TYPED_TEST(ColumnViewDeviceSpanTests, conversion_round_trip) +{ + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + + // Test implicit conversion, round trip + cudf::device_span device_span_from_col_view = col_view; + cudf::column_view col_view_from_device_span = device_span_from_col_view; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_view, col_view_from_device_span); +} + +struct ColumnViewDeviceSpanErrorTests : public cudf::test::BaseFixture { +}; + +TEST_F(ColumnViewDeviceSpanErrorTests, type_mismatch) +{ + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + EXPECT_THROW((void)cudf::device_span{col_view}, cudf::logic_error); +} + +TEST_F(ColumnViewDeviceSpanErrorTests, nullable_column) +{ + auto col = example_column(); + col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_NULL), col->size()); + auto col_view = cudf::column_view{*col}; + EXPECT_THROW((void)cudf::device_span{col_view}, cudf::logic_error); +} diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index 210b4b8f90d..256f9129cbf 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -54,10 +54,22 @@ std::unique_ptr make_scalar( } template -auto lowest = std::numeric_limits::lowest(); +constexpr auto highest() +{ + // chrono types do not have std::numeric_limits specializations and should use T::max() + // https://eel.is/c++draft/numeric.limits.general#6 + if constexpr (cudf::is_chrono()) return T::max(); + return std::numeric_limits::max(); +} template -auto highest = std::numeric_limits::max(); +constexpr auto lowest() +{ + // chrono types do not have std::numeric_limits specializations and should use T::min() + // https://eel.is/c++draft/numeric.limits.general#6 + if constexpr (cudf::is_chrono()) return T::min(); + return std::numeric_limits::lowest(); +} template struct ShiftTest : public cudf::test::BaseFixture { @@ -101,16 +113,16 @@ TYPED_TEST(ShiftTest, OneColumn) { using T = TypeParam; - auto input = fixed_width_column_wrapper{lowest, + auto input = fixed_width_column_wrapper{lowest(), cudf::test::make_type_param_scalar(1), cudf::test::make_type_param_scalar(2), cudf::test::make_type_param_scalar(3), cudf::test::make_type_param_scalar(4), cudf::test::make_type_param_scalar(5), - highest}; + highest()}; auto expected = fixed_width_column_wrapper{cudf::test::make_type_param_scalar(7), cudf::test::make_type_param_scalar(7), - lowest, + lowest(), cudf::test::make_type_param_scalar(1), cudf::test::make_type_param_scalar(2), cudf::test::make_type_param_scalar(3), @@ -126,16 +138,16 @@ TYPED_TEST(ShiftTest, OneColumnNegativeShift) { using T = TypeParam; - auto input = fixed_width_column_wrapper{lowest, + auto input = fixed_width_column_wrapper{lowest(), cudf::test::make_type_param_scalar(1), cudf::test::make_type_param_scalar(2), cudf::test::make_type_param_scalar(3), cudf::test::make_type_param_scalar(4), cudf::test::make_type_param_scalar(5), - highest}; + highest()}; auto expected = fixed_width_column_wrapper{cudf::test::make_type_param_scalar(4), cudf::test::make_type_param_scalar(5), - highest, + highest(), cudf::test::make_type_param_scalar(7), cudf::test::make_type_param_scalar(7), cudf::test::make_type_param_scalar(7), @@ -151,16 +163,16 @@ TYPED_TEST(ShiftTest, OneColumnNullFill) { using T = TypeParam; - auto input = fixed_width_column_wrapper{lowest, + auto input = fixed_width_column_wrapper{lowest(), cudf::test::make_type_param_scalar(5), cudf::test::make_type_param_scalar(0), cudf::test::make_type_param_scalar(3), cudf::test::make_type_param_scalar(0), cudf::test::make_type_param_scalar(1), - highest}; + highest()}; auto expected = fixed_width_column_wrapper({cudf::test::make_type_param_scalar(0), cudf::test::make_type_param_scalar(0), - lowest, + lowest(), cudf::test::make_type_param_scalar(5), cudf::test::make_type_param_scalar(0), cudf::test::make_type_param_scalar(3), diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index fddaa9d2050..fd065249c4e 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
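// The highest()/lowest() helpers above guard against a quiet footgun: the
// primary std::numeric_limits template is defined for arbitrary T and returns
// value-initialized results, so for cudf chrono types (which have no
// specialization) max() silently yields zero instead of failing to compile.
// A minimal illustration with a hypothetical strong type:
#include <limits>

struct my_duration { int ticks; };  // no numeric_limits specialization
static_assert(std::numeric_limits<my_duration>::is_specialized == false);
// std::numeric_limits<my_duration>::max() still compiles and returns
// my_duration{}, i.e. zero ticks -- hence the is_chrono branches that call
// T::max() / T::min() instead.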
@@ -129,8 +129,13 @@ struct AtomicsTest : public cudf::test::BaseFixture { thrust::host_vector result_init(9); // +3 padding for int8 tests result_init[0] = cudf::test::make_type_param_scalar(0); - result_init[1] = std::numeric_limits::max(); - result_init[2] = std::numeric_limits::min(); + if constexpr (cudf::is_chrono()) { + result_init[1] = T::max(); + result_init[2] = T::min(); + } else { + result_init[1] = std::numeric_limits::max(); + result_init[2] = std::numeric_limits::min(); + } result_init[3] = result_init[0]; result_init[4] = result_init[1]; result_init[5] = result_init[2]; diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index 1b1b12ea69e..81369beb2ec 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,17 +23,27 @@ #include -using namespace cudf::test::iterators; - namespace cudf { namespace test { -inline void test_pair_rank_scans(column_view const& keys, - column_view const& order, - column_view const& expected_dense, - column_view const& expected_rank, - null_policy include_null_keys = null_policy::INCLUDE, - sorted keys_are_sorted = sorted::YES) +using namespace iterators; + +template +using input = fixed_width_column_wrapper; +using rank_result_col = fixed_width_column_wrapper; +using percent_result_t = + cudf::detail::target_type_t; +using percent_result_col = fixed_width_column_wrapper; +using null_iter_t = decltype(nulls_at({})); + +auto constexpr X = int32_t{0}; // Placeholder for NULL rows. +auto const all_valid = nulls_at({}); + +inline void test_rank_scans(column_view const& keys, + column_view const& order, + column_view const& expected_dense, + column_view const& expected_rank, + column_view const& expected_percent_rank) { test_single_scan(keys, order, @@ -49,6 +59,13 @@ inline void test_pair_rank_scans(column_view const& keys, make_rank_aggregation(), null_policy::INCLUDE, sorted::YES); + test_single_scan(keys, + order, + keys, + expected_percent_rank, + make_percent_rank_aggregation(), + null_policy::INCLUDE, + sorted::YES); } struct groupby_rank_scan_test : public BaseFixture { @@ -70,248 +87,283 @@ TYPED_TEST(typed_groupby_rank_scan_test, empty_cols) { using T = TypeParam; - fixed_width_column_wrapper keys{}; - fixed_width_column_wrapper order_col{}; - structs_column_wrapper struct_order{}; + auto const keys = input{}; + auto const order_by = input{}; + auto const order_by_struct = structs_column_wrapper{}; - fixed_width_column_wrapper expected_dense_vals{}; - fixed_width_column_wrapper expected_rank_vals{}; + auto const expected_dense = rank_result_col{}; + auto const expected_rank = rank_result_col{}; + auto const expected_percent = percent_result_col{}; - test_pair_rank_scans(keys, order_col, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, zero_valid_keys) { using T = TypeParam; - fixed_width_column_wrapper keys{{1, 2, 3}, all_nulls()}; - fixed_width_column_wrapper order_col1{3, 3, 1}; - fixed_width_column_wrapper order_col2{3, 3, 
1}; - fixed_width_column_wrapper order_col3{3, 3, 1}; - structs_column_wrapper struct_order{order_col2, order_col3}; + auto const keys = input{{X, X, X}, all_nulls()}; + auto const order_by = input{{3, 3, 1}}; + auto const order_by_struct = [] { + auto member_1 = input{{3, 3, 1}}; + auto member_2 = input{{3, 3, 1}}; + return structs_column_wrapper{member_1, member_2}; + }(); - fixed_width_column_wrapper expected_dense_vals{1, 1, 2}; - fixed_width_column_wrapper expected_rank_vals{1, 1, 3}; + auto const dense_rank_results = rank_result_col{1, 1, 2}; + auto const rank_results = rank_result_col{1, 1, 3}; + auto const percent_rank_result = percent_result_col{0, 0, 1}; - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); + test_rank_scans(keys, order_by, dense_rank_results, rank_results, percent_rank_result); + test_rank_scans(keys, order_by_struct, dense_rank_results, rank_results, percent_rank_result); } TYPED_TEST(typed_groupby_rank_scan_test, zero_valid_orders) { - using T = TypeParam; - - fixed_width_column_wrapper keys{1, 1, 3, 3}; - fixed_width_column_wrapper order_col1{{5, 6, 7, 8}, all_nulls()}; - fixed_width_column_wrapper order_col2{{5, 6, 7, 8}, all_nulls()}; - fixed_width_column_wrapper order_col3{{5, 6, 7, 8}, all_nulls()}; - fixed_width_column_wrapper order_col4{{5, 6, 7, 8}, all_nulls()}; - fixed_width_column_wrapper order_col5{{5, 6, 7, 8}, all_nulls()}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, all_nulls()}; - - fixed_width_column_wrapper expected_dense_vals{1, 1, 1, 1}; - fixed_width_column_wrapper expected_rank_vals{1, 1, 1, 1}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order_with_nulls, expected_dense_vals, expected_rank_vals); + using T = TypeParam; + using null_iter_t = decltype(all_nulls()); + + auto const keys = input{{1, 1, 3, 3}}; + auto const make_order_by = [&] { return input{{X, X, X, X}, all_nulls()}; }; + auto const make_struct_order_by = [&](null_iter_t const& null_iter = no_nulls()) { + auto member1 = make_order_by(); + auto member2 = make_order_by(); + return structs_column_wrapper{{member1, member2}, null_iter}; + }; + auto const order_by = make_order_by(); + auto const order_by_struct = make_struct_order_by(); + auto const order_by_struct_all_nulls = make_struct_order_by(all_nulls()); + + auto const expected_dense = rank_result_col{1, 1, 1, 1}; + auto const expected_rank = rank_result_col{1, 1, 1, 1}; + auto const expected_percent = percent_result_col{0, 0, 0, 0}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct_all_nulls, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, basic) { using T = TypeParam; - fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; - fixed_width_column_wrapper order_col1{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; - fixed_width_column_wrapper order_col2{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; - fixed_width_column_wrapper order_col3{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; - structs_column_wrapper struct_order{order_col2, order_col3}; - - 
fixed_width_column_wrapper expected_dense_vals = { - {1, 1, 1, 2, 2, 2, 3, 1, 2, 2, 3, 3}}; - fixed_width_column_wrapper expected_rank_vals = - fixed_width_column_wrapper{{1, 1, 1, 4, 4, 4, 7, 1, 2, 2, 4, 4}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); + auto const keys = input{0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; + auto const make_order_by = [&] { return input{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; }; + auto const order_by = make_order_by(); + auto const order_by_struct = [&] { + auto order2 = make_order_by(); + auto order3 = make_order_by(); + return structs_column_wrapper{order2, order3}; + }(); + + auto const expected_dense = rank_result_col{1, 1, 1, 2, 2, 2, 3, 1, 2, 2, 3, 3}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 4, 4, 7, 1, 2, 2, 4, 4}; + auto const expected_percent = percent_result_col{ + 0.0, 0.0, 0.0, 3.0 / 6, 3.0 / 6, 3.0 / 6, 6.0 / 6, 0.0, 1.0 / 4, 1.0 / 4, 3.0 / 4, 3.0 / 4}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, null_orders) { using T = TypeParam; - fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1}; - fixed_width_column_wrapper order_col1{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col2{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col3{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col4{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col5{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - - fixed_width_column_wrapper expected_dense_vals{{1, 2, 3, 4, 5, 5, 1, 1, 2, 3, 3, 3}}; - fixed_width_column_wrapper expected_rank_vals{{1, 2, 3, 4, 5, 5, 1, 1, 3, 4, 4, 4}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); + auto const null_mask = nulls_at({2, 8}); + auto const keys = input{{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1}}; + auto const make_order_by = [&] { + return input{{-1, -2, X, -2, -3, -3, -4, -4, X, -5, -5, -5}, null_mask}; + }; + auto const make_struct_order_by = [&](null_iter_t const& null_iter = all_valid) { + auto member1 = make_order_by(); + auto member2 = make_order_by(); + return structs_column_wrapper{{member1, member2}, null_iter}; + }; + auto const order_by = make_order_by(); + auto const order_by_struct = make_struct_order_by(); + auto const order_by_struct_with_nulls = make_struct_order_by(null_mask); + + auto const expected_dense = rank_result_col{1, 2, 3, 4, 5, 5, 1, 1, 2, 3, 3, 3}; + auto const expected_rank = rank_result_col{1, 2, 3, 4, 5, 5, 1, 1, 3, 4, 4, 4}; + auto const expected_percent = percent_result_col{ + 0.0, 1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 4.0 / 5, 0.0, 0.0, 2.0 / 5, 3.0 / 5, 3.0 / 5, 3.0 / 5}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, 
expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); + test_rank_scans( + keys, order_by_struct_with_nulls, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, null_orders_and_keys) { using T = TypeParam; - fixed_width_column_wrapper keys = {{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; - fixed_width_column_wrapper order_col1{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col2{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col3{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col4{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col5{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - - fixed_width_column_wrapper expected_dense_vals{{1, 2, 3, 4, 5, 5, 1, 1, 2, 1, 1, 2}}; - fixed_width_column_wrapper expected_rank_vals{{1, 2, 3, 4, 5, 5, 1, 1, 3, 1, 1, 3}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order_with_nulls, expected_dense_vals, expected_rank_vals); + auto const null_mask = nulls_at({2, 8}); + auto const keys = input{{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1}, nulls_at({9, 10, 11})}; + auto const make_order_by = [&] { + return input{{-1, -2, -2, -2, -3, -3, -4, -4, -4, -5, -5, -6}, null_mask}; + }; + auto const make_struct_order_by = [&](null_iter_t const& null_iter = all_valid) { + auto member1 = make_order_by(); + auto member2 = make_order_by(); + return structs_column_wrapper{{member1, member2}, null_iter}; + }; + auto const order_by = make_order_by(); + auto const order_by_struct = make_struct_order_by(); + auto const order_by_struct_with_nulls = make_struct_order_by(null_mask); + + auto const expected_dense = rank_result_col{{1, 2, 3, 4, 5, 5, 1, 1, 2, 1, 1, 2}}; + auto const expected_rank = rank_result_col{{1, 2, 3, 4, 5, 5, 1, 1, 3, 1, 1, 3}}; + auto const expected_percent = percent_result_col{ + {0.0, 1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 4.0 / 5, 0.0, 0.0, 2.0 / 2, 0.0, 0.0, 2.0 / 2}}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_struct, expected_dense, expected_rank, expected_percent); + test_rank_scans( + keys, order_by_struct_with_nulls, expected_dense, expected_rank, expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, mixedStructs) { - auto col = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto strings = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto struct_col = structs_column_wrapper{{col, strings}, null_at(11)}.release(); - - strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; - auto expected_dense_vals = - fixed_width_column_wrapper{1, 1, 2, 2, 3, 4, 1, 1, 2, 1, 1, 2}; - auto expected_rank_vals 
= - fixed_width_column_wrapper{1, 1, 3, 3, 5, 6, 1, 1, 3, 1, 1, 3}; + auto const struct_col = [] { + auto nums = input{{0, 0, 7, 7, 7, X, 4, 4, 4, 9, 9, 9}, null_at(5)}; + auto strings = strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "XX", "9", "9", "10d"}, null_at(8)}; + return structs_column_wrapper{{nums, strings}, null_at(11)}.release(); + }(); + + auto const keys = strings_column_wrapper{ + {"0", "0", "0", "0", "0", "0", "1", "1", "1", "X", "X", "X"}, nulls_at({9, 10, 11})}; + + auto const expected_dense = rank_result_col{1, 1, 2, 2, 3, 4, 1, 1, 2, 1, 1, 2}; + auto const expected_rank = rank_result_col{1, 1, 3, 3, 5, 6, 1, 1, 3, 1, 1, 3}; + auto const expected_percent = percent_result_col{ + 0.0, 0.0, 2.0 / 5, 2.0 / 5, 4.0 / 5, 5.0 / 5, 0.0, 0.0, 2.0 / 2, 0.0, 0.0, 2.0 / 2}; std::vector requests; requests.emplace_back(groupby::scan_request()); requests[0].values = *struct_col; requests[0].aggregations.push_back(make_dense_rank_aggregation()); requests[0].aggregations.push_back(make_rank_aggregation()); + requests[0].aggregations.push_back(make_percent_rank_aggregation()); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); - auto result = gb_obj.scan(requests); + auto [result_keys, agg_results] = gb_obj.scan(requests); - CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({keys}), result.first->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[0], expected_dense_vals); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[1], expected_rank_vals); + CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({keys}), result_keys->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[0], expected_dense); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[1], expected_rank); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[2], expected_percent); } TYPED_TEST(typed_groupby_rank_scan_test, nestedStructs) { using T = TypeParam; - auto col1 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto col2 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto col3 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto col4 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto strings1 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto strings2 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto struct_col = structs_column_wrapper{col1, strings1}; - auto nested_col = structs_column_wrapper{struct_col, col2}.release(); - auto flattened_col = structs_column_wrapper{col3, strings2, col4}.release(); - - strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; + auto nested_structs = [] { + auto structs_member = [] { + auto nums_member = input{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + auto strings_member = strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; + return structs_column_wrapper{nums_member, strings_member}; + }(); + auto nums_member = input{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + return structs_column_wrapper{structs_member, nums_member}.release(); + }(); + + auto flat_struct = [] { + auto nums_member = input{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + auto 
strings_member = strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; + auto nuther_nums = + fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + return structs_column_wrapper{nums_member, strings_member, nuther_nums}.release(); + }(); + + auto const keys = strings_column_wrapper{ + {"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, nulls_at({9, 10, 11})}; std::vector requests; requests.emplace_back(groupby::scan_request()); requests.emplace_back(groupby::scan_request()); - requests[0].values = *nested_col; + requests[0].values = *nested_structs; requests[0].aggregations.push_back(make_dense_rank_aggregation()); requests[0].aggregations.push_back(make_rank_aggregation()); - requests[1].values = *flattened_col; + requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[1].values = *flat_struct; requests[1].aggregations.push_back(make_dense_rank_aggregation()); requests[1].aggregations.push_back(make_rank_aggregation()); + requests[1].aggregations.push_back(make_percent_rank_aggregation()); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); - auto result = gb_obj.scan(requests); + auto [result_keys, agg_results] = gb_obj.scan(requests); - CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({keys}), result.first->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[0], *result.second[1].results[0]); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[1], *result.second[1].results[1]); + CUDF_TEST_EXPECT_TABLES_EQUAL(table_view({keys}), result_keys->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[0], *agg_results[1].results[0]); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[1], *agg_results[1].results[1]); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[2], *agg_results[1].results[2]); } TYPED_TEST(typed_groupby_rank_scan_test, structsWithNullPushdown) { using T = TypeParam; - auto col1 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto col2 = fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; - auto strings1 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto strings2 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - - std::vector> struct_columns; - struct_columns.push_back(col1.release()); - struct_columns.push_back(strings1.release()); - auto struct_col = - cudf::make_structs_column(12, std::move(struct_columns), 0, rmm::device_buffer{}); - auto const struct_nulls = - thrust::host_vector(std::vector{1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0}); - struct_col->set_null_mask( - cudf::test::detail::make_null_mask(struct_nulls.begin(), struct_nulls.end())); - - std::vector> null_struct_columns; - null_struct_columns.push_back(col2.release()); - null_struct_columns.push_back(strings2.release()); - auto null_col = - cudf::make_structs_column(12, std::move(null_struct_columns), 0, rmm::device_buffer{}); - null_col->set_null_mask(create_null_mask(12, cudf::mask_state::ALL_NULL)); - - strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; + auto constexpr num_rows = 12; + + auto get_struct_column = [] { + auto nums_member = + fixed_width_column_wrapper{{0, 0, 7, 7, 7, 5, 4, 4, 4, 9, 9, 9}, null_at(5)}; + auto 
strings_member = strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; + auto struct_column = structs_column_wrapper{nums_member, strings_member}.release(); + // Reset null-mask, a posteriori. Nulls will not be pushed down to children. + auto const null_iter = nulls_at({1, 2, 11}); + struct_column->set_null_mask( + cudf::test::detail::make_null_mask(null_iter, null_iter + num_rows)); + return struct_column; + }; + + auto const possibly_null_structs = get_struct_column(); + + auto const definitely_null_structs = [&] { + auto struct_column = get_struct_column(); + struct_column->set_null_mask(create_null_mask(num_rows, mask_state::ALL_NULL)); + return struct_column; + }(); + + strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "X", "X", "X"}, + nulls_at({9, 10, 11})}; std::vector requests; requests.emplace_back(groupby::scan_request()); requests.emplace_back(groupby::scan_request()); - requests[0].values = *struct_col; + requests[0].values = *possibly_null_structs; requests[0].aggregations.push_back(make_dense_rank_aggregation()); requests[0].aggregations.push_back(make_rank_aggregation()); - requests[1].values = *null_col; + requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[1].values = *definitely_null_structs; requests[1].aggregations.push_back(make_dense_rank_aggregation()); requests[1].aggregations.push_back(make_rank_aggregation()); + requests[1].aggregations.push_back(make_percent_rank_aggregation()); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); - auto result = gb_obj.scan(requests); - - auto expected_dense_vals = - fixed_width_column_wrapper{1, 2, 2, 3, 4, 5, 1, 1, 2, 1, 1, 2}; - auto expected_rank_vals = - fixed_width_column_wrapper{1, 2, 2, 4, 5, 6, 1, 1, 3, 1, 1, 3}; - auto expected_null_result = - fixed_width_column_wrapper{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[0], expected_dense_vals); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[1], expected_rank_vals); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[1].results[0], expected_null_result); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[1].results[1], expected_null_result); + auto [result_keys, agg_results] = gb_obj.scan(requests); + + auto expected_dense = rank_result_col{1, 2, 2, 3, 4, 5, 1, 1, 2, 1, 1, 2}; + auto expected_rank = rank_result_col{1, 2, 2, 4, 5, 6, 1, 1, 3, 1, 1, 3}; + auto expected_percent = percent_result_col{ + 0.0, 1.0 / 5, 1.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5, 0.0, 0.0, 2.0 / 2, 0.0, 0.0, 2.0 / 2}; + auto expected_rank_for_null = rank_result_col{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + auto expected_percent_for_null = + percent_result_col{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[0], expected_dense); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[1], expected_rank); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[0].results[2], expected_percent); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[1].results[0], expected_rank_for_null); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[1].results[1], expected_rank_for_null); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*agg_results[1].results[2], expected_percent_for_null); } /* List support dependent on https://github.com/rapidsai/cudf/issues/8683 @@ -372,67 +424,72 @@ TYPED_TEST(list_groupby_rank_scan_test, lists) 
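// A small worked case of the three grouped scans these tests exercise
// (values illustrative): with pre-sorted keys {0,0,0,1,1} and order-by values
// {5,5,4,3,3}, each scan restarts at every key-group boundary:
//   dense_rank   -> {1, 1, 2, 1, 1}
//   rank         -> {1, 1, 3, 1, 1}
//   percent_rank -> {(1-1)/2, (1-1)/2, (3-1)/2, (1-1)/1, (1-1)/1}
//                 = {0.0, 0.0, 1.0, 0.0, 0.0}
// where each denominator is (group size - 1).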
TEST(groupby_rank_scan_test, bools) { - fixed_width_column_wrapper keys = {{0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; - fixed_width_column_wrapper order_col1{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col2{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col3{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col4{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - fixed_width_column_wrapper order_col5{{0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, - {1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1}}; - - fixed_width_column_wrapper expected_dense_vals{{1, 1, 2, 2, 2, 2, 1, 2, 3, 1, 1, 2}}; - fixed_width_column_wrapper expected_rank_vals{{1, 1, 3, 3, 3, 3, 1, 2, 3, 1, 1, 3}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order_with_nulls, expected_dense_vals, expected_rank_vals); + using bools = fixed_width_column_wrapper; + using null_iter_t = decltype(nulls_at({})); + + auto const keys = bools{{0, 0, 0, 0, 0, 0, 1, 1, 1, X, X, X}, nulls_at({9, 10, 11})}; + auto const nulls_6_8 = nulls_at({6, 8}); + auto const make_order_by = [&] { return bools{{0, 0, 1, 1, 1, 1, X, 1, X, 0, 0, 1}, nulls_6_8}; }; + auto const make_structs = [&](null_iter_t const& null_iter = all_valid) { + auto member_1 = make_order_by(); + auto member_2 = make_order_by(); + return structs_column_wrapper{{member_1, member_2}, null_iter}; + }; + + auto const order_by = make_order_by(); + auto const order_by_structs = make_structs(); + auto const order_by_structs_with_nulls = make_structs(nulls_6_8); + + auto const expected_dense = rank_result_col{{1, 1, 2, 2, 2, 2, 1, 2, 3, 1, 1, 2}}; + auto const expected_rank = rank_result_col{{1, 1, 3, 3, 3, 3, 1, 2, 3, 1, 1, 3}}; + auto const expected_percent = percent_result_col{ + {0.0, 0.0, 2.0 / 5, 2.0 / 5, 2.0 / 5, 2.0 / 5, 0.0, 1.0 / 2, 2.0 / 2, 0.0, 0.0, 2.0 / 2}}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_structs, expected_dense, expected_rank, expected_percent); + test_rank_scans( + keys, order_by_structs_with_nulls, expected_dense, expected_rank, expected_percent); } TEST(groupby_rank_scan_test, strings) { - strings_column_wrapper keys = {{"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, - {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}}; - strings_column_wrapper order_col1{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", "-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - strings_column_wrapper order_col2{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", "-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - strings_column_wrapper order_col3{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", "-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - strings_column_wrapper order_col4{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", "-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - strings_column_wrapper order_col5{ - {"-1", "-2", "-2", "-2", "-3", "-3", "-4", "-4", 
"-4", "-5", "-5", "-6"}, - {1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1}}; - structs_column_wrapper struct_order{order_col2, order_col3}; - structs_column_wrapper struct_order_with_nulls{{order_col4, order_col5}, - {1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0}}; - - fixed_width_column_wrapper expected_dense_vals{{1, 2, 3, 4, 5, 5, 1, 1, 2, 1, 1, 2}}; - fixed_width_column_wrapper expected_rank_vals{{1, 2, 3, 4, 5, 5, 1, 1, 3, 1, 1, 3}}; - - test_pair_rank_scans(keys, order_col1, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order, expected_dense_vals, expected_rank_vals); - test_pair_rank_scans(keys, struct_order_with_nulls, expected_dense_vals, expected_rank_vals); + using strings = strings_column_wrapper; + using null_iter_t = decltype(nulls_at({})); + + auto const keys = + strings{{"0", "0", "0", "0", "0", "0", "1", "1", "1", "X", "X", "X"}, nulls_at({9, 10, 11})}; + auto const nulls_2_8 = nulls_at({2, 8}); + auto const make_order_by = [&] { + return strings{{"-1", "-2", "X", "-2", "-3", "-3", "-4", "-4", "X", "-5", "-5", "-6"}, + nulls_2_8}; + }; + auto const make_structs = [&](null_iter_t const& null_iter = all_valid) { + auto member_1 = make_order_by(); + auto member_2 = make_order_by(); + return structs_column_wrapper{{member_1, member_2}, null_iter}; + }; + + auto const order_by = make_order_by(); + auto const order_by_structs = make_structs(); + auto const order_by_structs_with_nulls = make_structs(nulls_at({4, 5, 11})); + + auto const expected_dense = rank_result_col{{1, 2, 3, 4, 5, 5, 1, 1, 2, 1, 1, 2}}; + auto const expected_rank = rank_result_col{{1, 2, 3, 4, 5, 5, 1, 1, 3, 1, 1, 3}}; + auto const expected_percent = percent_result_col{ + {0.0, 1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 4.0 / 5, 0.0, 0.0, 2.0 / 2, 0.0, 0.0, 2.0 / 2}}; + + test_rank_scans(keys, order_by, expected_dense, expected_rank, expected_percent); + test_rank_scans(keys, order_by_structs, expected_dense, expected_rank, expected_percent); + test_rank_scans( + keys, order_by_structs_with_nulls, expected_dense, expected_rank, expected_percent); } TEST_F(groupby_rank_scan_test_failures, test_exception_triggers) { using T = uint32_t; - fixed_width_column_wrapper keys{{1, 2, 3}, {1, 1, 0}}; - fixed_width_column_wrapper col{3, 3, 1}; + auto const keys = input{{1, 2, 3}, null_at(2)}; + auto const col = input{3, 3, 1}; CUDF_EXPECT_THROW_MESSAGE( test_single_scan(keys, diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp index 5947e309bec..be7da4a784c 100644 --- a/cpp/tests/groupby/sum_tests.cpp +++ b/cpp/tests/groupby/sum_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -156,6 +156,27 @@ TYPED_TEST(groupby_sum_test, dictionary) force_use_sort_impl::YES); } +struct overflow_test : public cudf::test::BaseFixture { +}; +TEST_F(overflow_test, overflow_integer) +{ + using int32_col = fixed_width_column_wrapper; + using int64_col = fixed_width_column_wrapper; + + auto const keys = int32_col{0, 0}; + auto const vals = int32_col{-2147483648, -2147483648}; + auto const expect_keys = int32_col{0}; + auto const expect_vals = int64_col{-4294967296L}; + + auto test_sum = [&](auto const use_sort) { + auto agg = make_sum_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), use_sort); + }; + + test_sum(force_use_sort_impl::NO); + test_sum(force_use_sort_impl::YES); +} + template struct FixedPointTestAllReps : public cudf::test::BaseFixture { }; diff --git a/cpp/tests/io/file_io_test.cpp b/cpp/tests/io/file_io_test.cpp new file mode 100644 index 00000000000..b546239fdca --- /dev/null +++ b/cpp/tests/io/file_io_test.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include + +// Base test fixture for tests +struct CuFileIOTest : public cudf::test::BaseFixture { +}; + +TEST_F(CuFileIOTest, SliceSize) +{ + std::vector> test_cases{ + {1 << 20, 1 << 18}, {1 << 18, 1 << 20}, {1 << 20, 3333}, {0, 1 << 18}, {0, 0}, {1 << 20, 0}}; + for (auto const& test_case : test_cases) { + auto const slices = cudf::io::detail::make_file_io_slices(test_case.first, test_case.second); + if (slices.empty()) { + ASSERT_EQ(test_case.first, 0); + } else { + ASSERT_EQ(slices.front().offset, 0); + ASSERT_EQ(slices.back().offset + slices.back().size, test_case.first); + for (auto i = 1u; i < slices.size(); ++i) { + ASSERT_EQ(slices[i].offset, slices[i - 1].offset + slices[i - 1].size); + } + } + } +} + +CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index e6ae709f009..57041e448a2 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
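The overflow_integer test above hinges on the groupby SUM of INT32 inputs producing an INT64 result, so adding INT32_MIN to itself yields -4294967296 instead of wrapping. A standalone sketch of the same widening in plain C++ (no cudf calls; only the promotion behavior implied by the expected int64_col is assumed):

```cpp
#include <cstdint>
#include <iostream>

int main()
{
  int32_t const v   = INT32_MIN;                // -2147483648, as in the test
  int64_t const sum = int64_t{v} + int64_t{v};  // widen each operand before adding
  std::cout << sum << '\n';                     // prints -4294967296
  return 0;
}
```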
@@ -1004,7 +1004,7 @@ TEST_F(JoinTest, EmptyRightTableInnerJoin) std::size_t const size_gold = 0; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.inner_join(t0, optional_size); column_wrapper col_gold_0{}; column_wrapper col_gold_1{}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1043,7 +1043,7 @@ TEST_F(JoinTest, EmptyRightTableLeftJoin) std::size_t const size_gold = 5; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.left_join(t0, optional_size); column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1082,7 +1082,7 @@ TEST_F(JoinTest, EmptyRightTableFullJoin) std::size_t const size_gold = 5; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.full_join(t0, optional_size); column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1310,7 +1310,7 @@ TEST_F(JoinTest, HashJoinSequentialProbes) std::size_t const size_gold = 9; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.full_join(t0, optional_size); column_wrapper col_gold_0{{NoneValue, NoneValue, NoneValue, NoneValue, 4, 0, 1, 2, 3}}; column_wrapper col_gold_1{{0, 1, 2, 3, 4, NoneValue, NoneValue, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1330,7 +1330,7 @@ TEST_F(JoinTest, HashJoinSequentialProbes) std::size_t const size_gold = 5; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.left_join(t0, optional_size); column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{NoneValue, NoneValue, NoneValue, NoneValue, 4}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1350,7 +1350,7 @@ TEST_F(JoinTest, HashJoinSequentialProbes) std::size_t const size_gold = 3; EXPECT_EQ(output_size, size_gold); - auto result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, optional_size); + auto result = hash_join.inner_join(t0, optional_size); column_wrapper col_gold_0{{2, 4, 0}}; column_wrapper col_gold_1{{1, 1, 4}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1390,7 +1390,7 @@ TEST_F(JoinTest, HashJoinWithStructsAndNulls) { auto output_size = hash_join.left_join_size(t0); EXPECT_EQ(5, output_size); - auto result = hash_join.left_join(t0, cudf::null_equality::EQUAL, output_size); + auto result = hash_join.left_join(t0, output_size); column_wrapper col_gold_0{{0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{0, NoneValue, 2, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1400,7 +1400,7 @@ TEST_F(JoinTest, HashJoinWithStructsAndNulls) { auto output_size = hash_join.inner_join_size(t0); EXPECT_EQ(2, output_size); - auto 
result = hash_join.inner_join(t0, cudf::null_equality::EQUAL, output_size); + auto result = hash_join.inner_join(t0, output_size); column_wrapper col_gold_0{{0, 2}}; column_wrapper col_gold_1{{0, 2}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); @@ -1410,7 +1410,7 @@ TEST_F(JoinTest, HashJoinWithStructsAndNulls) { auto output_size = hash_join.full_join_size(t0); EXPECT_EQ(8, output_size); - auto result = hash_join.full_join(t0, cudf::null_equality::EQUAL, output_size); + auto result = hash_join.full_join(t0, output_size); column_wrapper col_gold_0{{NoneValue, NoneValue, NoneValue, 0, 1, 2, 3, 4}}; column_wrapper col_gold_1{{1, 3, 4, 0, NoneValue, 2, NoneValue, NoneValue}}; auto const [sorted_gold, sorted_result] = gather_maps_as_tables(col_gold_0, col_gold_1, result); diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp index 2249fac4e2e..ffcbce08163 100644 --- a/cpp/tests/reductions/rank_tests.cpp +++ b/cpp/tests/reductions/rank_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,22 +20,40 @@ #include #include +#include #include #include +#include using aggregation = cudf::aggregation; using cudf::null_policy; using cudf::scan_type; -using namespace cudf::test::iterators; + +namespace cudf::test { + +using namespace iterators; + +template +using input = fixed_width_column_wrapper; +using rank_result_col = fixed_width_column_wrapper; +using percent_result_t = + cudf::detail::target_type_t; +using percent_result_col = fixed_width_column_wrapper; + +auto const rank = cudf::make_rank_aggregation(); +auto const dense_rank = cudf::make_dense_rank_aggregation(); +auto const percent_rank = cudf::make_percent_rank_aggregation(); + +auto constexpr INCLUSIVE_SCAN = cudf::scan_type::INCLUSIVE; +auto constexpr INCLUDE_NULLS = cudf::null_policy::INCLUDE; template struct TypedRankScanTest : BaseScanTest { inline void test_ungrouped_rank_scan(cudf::column_view const& input, cudf::column_view const& expect_vals, - std::unique_ptr const& agg, - null_policy null_handling) + std::unique_ptr const& agg) { - auto col_out = cudf::scan(input, agg, scan_type::INCLUSIVE, null_handling); + auto col_out = cudf::scan(input, agg, INCLUSIVE_SCAN, INCLUDE_NULLS); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expect_vals, col_out->view()); } }; @@ -55,16 +73,25 @@ TYPED_TEST(TypedRankScanTest, Rank) return make_vector({-120, -120, -120, -16, -16, 5, 6, 6, 6, 6, 34, 113}); return make_vector({5, 5, 5, 6, 6, 9, 11, 11, 11, 11, 14, 34}); }(); - auto col = this->make_column(v); - - auto const expected_dense_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 5, 6}; - auto const expected_rank_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 4, 4, 6, 7, 7, 7, 7, 11, 12}; - this->test_ungrouped_rank_scan( - *col, expected_dense_vals, cudf::make_dense_rank_aggregation(), null_policy::INCLUDE); - this->test_ungrouped_rank_scan( - *col, expected_rank_vals, cudf::make_rank_aggregation(), null_policy::INCLUDE); + auto const col = this->make_column(v); + + auto const expected_dense = rank_result_col{1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 5, 6}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 4, 6, 7, 7, 7, 7, 11, 12}; + auto const expected_percent = percent_result_col{0.0, + 0.0, + 0.0, + 3.0 / 11, + 3.0 / 11, + 5.0 / 
11, + 6.0 / 11, + 6.0 / 11, + 6.0 / 11, + 6.0 / 11, + 10.0 / 11, + 11.0 / 11}; + this->test_ungrouped_rank_scan(*col, expected_dense, dense_rank); + this->test_ungrouped_rank_scan(*col, expected_rank, rank); + this->test_ungrouped_rank_scan(*col, expected_percent, percent_rank); } TYPED_TEST(TypedRankScanTest, RankWithNulls) @@ -74,132 +101,159 @@ TYPED_TEST(TypedRankScanTest, RankWithNulls) return make_vector({-120, -120, -120, -16, -16, 5, 6, 6, 6, 6, 34, 113}); return make_vector({5, 5, 5, 6, 6, 9, 11, 11, 11, 11, 14, 34}); }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0}); - auto col = this->make_column(v, b); - - auto const expected_dense_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 2, 3, 4, 5, 5, 6, 6, 7, 8}; - auto const expected_rank_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 4, 5, 6, 7, 7, 9, 9, 11, 12}; - this->test_ungrouped_rank_scan( - *col, expected_dense_vals, cudf::make_dense_rank_aggregation(), null_policy::INCLUDE); - this->test_ungrouped_rank_scan( - *col, expected_rank_vals, cudf::make_rank_aggregation(), null_policy::INCLUDE); + auto const null_iter = nulls_at({3, 6, 7, 11}); + auto const b = thrust::host_vector(null_iter, null_iter + v.size()); + auto col = this->make_column(v, b); + + auto const expected_dense = rank_result_col{1, 1, 1, 2, 3, 4, 5, 5, 6, 6, 7, 8}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 5, 6, 7, 7, 9, 9, 11, 12}; + auto const expected_percent = percent_result_col{0.0, + 0.0, + 0.0, + 3.0 / 11, + 4.0 / 11, + 5.0 / 11, + 6.0 / 11, + 6.0 / 11, + 8.0 / 11, + 8.0 / 11, + 10.0 / 11, + 11.0 / 11}; + this->test_ungrouped_rank_scan(*col, expected_dense, dense_rank); + this->test_ungrouped_rank_scan(*col, expected_rank, rank); + this->test_ungrouped_rank_scan(*col, expected_percent, percent_rank); } -TYPED_TEST(TypedRankScanTest, MixedStructs) +namespace { +template +auto make_input_column() { - auto const v = [] { - if (std::is_signed::value) - return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); - return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); - }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); - auto col = this->make_column(v, b); - auto strings = cudf::test::strings_column_wrapper{ + if constexpr (std::is_same_v) { + return strings_column_wrapper{{"0", "0", "4", "4", "4", "5", "7", "7", "7", "9", "9", "9"}, + null_at(5)}; + } else { + return (std::is_signed_v) + ? 
input{{-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}, null_at(5)} + : input{{0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}, null_at(5)}; + } +} + +auto make_strings_column() +{ + return strings_column_wrapper{ {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - std::vector> vector_of_columns; - vector_of_columns.push_back(std::move(col)); - vector_of_columns.push_back(strings.release()); - auto struct_col = cudf::test::structs_column_wrapper{std::move(vector_of_columns)}.release(); - - auto expected_dense_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8}; - auto expected_rank_vals = - cudf::test::fixed_width_column_wrapper{1, 1, 3, 3, 5, 6, 7, 7, 9, 10, 10, 12}; - - this->test_ungrouped_rank_scan( - *struct_col, expected_dense_vals, cudf::make_dense_rank_aggregation(), null_policy::INCLUDE); - this->test_ungrouped_rank_scan( - *struct_col, expected_rank_vals, cudf::make_rank_aggregation(), null_policy::INCLUDE); +} + +template +auto make_mixed_structs_column() +{ + auto col = make_input_column(); + auto strings = make_strings_column(); + return structs_column_wrapper{{col, strings}}; +} +} // namespace + +TYPED_TEST(TypedRankScanTest, MixedStructs) +{ + auto const struct_col = make_mixed_structs_column(); + auto const expected_dense = rank_result_col{1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8}; + auto const expected_rank = rank_result_col{1, 1, 3, 3, 5, 6, 7, 7, 9, 10, 10, 12}; + auto const expected_percent = percent_result_col{0.0, + 0.0, + 2.0 / 11, + 2.0 / 11, + 4.0 / 11, + 5.0 / 11, + 6.0 / 11, + 6.0 / 11, + 8.0 / 11, + 9.0 / 11, + 9.0 / 11, + 11.0 / 11}; + + this->test_ungrouped_rank_scan(struct_col, expected_dense, dense_rank); + this->test_ungrouped_rank_scan(struct_col, expected_rank, rank); + this->test_ungrouped_rank_scan(struct_col, expected_percent, percent_rank); } TYPED_TEST(TypedRankScanTest, NestedStructs) { - auto const v = [] { - if (std::is_signed::value) - return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); - return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); + auto const nested_col = [&] { + auto struct_col = [&] { + auto col = make_input_column(); + auto strings = make_strings_column(); + return structs_column_wrapper{{col, strings}}; + }(); + auto col = make_input_column(); + return structs_column_wrapper{{struct_col, col}}; }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); - auto col1 = this->make_column(v, b); - auto col2 = this->make_column(v, b); - auto col3 = this->make_column(v, b); - auto col4 = this->make_column(v, b); - auto strings1 = cudf::test::strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto strings2 = cudf::test::strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - std::vector> struct_columns; - struct_columns.push_back(std::move(col1)); - struct_columns.push_back(strings1.release()); - auto struct_col = cudf::test::structs_column_wrapper{std::move(struct_columns)}; - std::vector> nested_columns; - nested_columns.push_back(struct_col.release()); - nested_columns.push_back(std::move(col2)); - auto nested_col = cudf::test::structs_column_wrapper{std::move(nested_columns)}; - std::vector> flat_columns; - flat_columns.push_back(std::move(col3)); - flat_columns.push_back(strings2.release()); - flat_columns.push_back(std::move(col4)); - auto flat_col = cudf::test::structs_column_wrapper{std::move(flat_columns)}; - - auto 
dense_out = cudf::scan( - nested_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto dense_expected = cudf::scan( - flat_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_out = cudf::scan( - nested_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_expected = - cudf::scan(flat_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + auto const flat_col = [&] { + auto col = make_input_column(); + auto strings_col = make_strings_column(); + auto nuther_col = make_input_column(); + return structs_column_wrapper{{col, strings_col, nuther_col}}; + }(); + auto const dense_out = cudf::scan(nested_col, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const dense_expected = cudf::scan(flat_col, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), dense_expected->view()); + + auto const rank_out = cudf::scan(nested_col, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const rank_expected = cudf::scan(flat_col, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), rank_expected->view()); + + auto const percent_out = cudf::scan(nested_col, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const percent_expected = cudf::scan(flat_col, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(percent_out->view(), percent_expected->view()); } -TYPED_TEST(TypedRankScanTest, structsWithNullPushdown) +TYPED_TEST(TypedRankScanTest, StructsWithNullPushdown) { - auto const v = [] { - if (std::is_signed::value) - return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); - return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); - }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); - auto col = this->make_column(v, b); - auto strings = cudf::test::strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - std::vector> struct_columns; - struct_columns.push_back(std::move(col)); - struct_columns.push_back(strings.release()); - - auto struct_col = - cudf::make_structs_column(12, std::move(struct_columns), 0, rmm::device_buffer{}); - - struct_col->set_null_mask(create_null_mask(12, cudf::mask_state::ALL_NULL)); - auto expected_null_result = - cudf::test::fixed_width_column_wrapper{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - auto dense_null_out = cudf::scan( - *struct_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_null_out = cudf::scan( - *struct_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_null_out->view(), expected_null_result); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_null_out->view(), expected_null_result); - - auto const struct_nulls = - thrust::host_vector(std::vector{1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - struct_col->set_null_mask( - cudf::test::detail::make_null_mask(struct_nulls.begin(), struct_nulls.end())); - auto expected_dense_vals = - cudf::test::fixed_width_column_wrapper{1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9}; - auto expected_rank_vals = - cudf::test::fixed_width_column_wrapper{1, 2, 2, 4, 5, 6, 7, 7, 9, 10, 10, 12}; - auto dense_out = cudf::scan( - *struct_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_out = cudf::scan( - *struct_col, 
cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), expected_dense_vals); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), expected_rank_vals); + auto struct_col = make_mixed_structs_column().release(); + + // First, verify that if the structs column has only nulls, all output rows are ranked 1. + { + struct_col->set_null_mask( + create_null_mask(12, cudf::mask_state::ALL_NULL)); // Null mask not pushed down to members. + auto const expected_null_result = rank_result_col{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + auto const expected_percent_rank_null_result = + percent_result_col{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + auto const dense_out = cudf::scan(*struct_col, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const rank_out = cudf::scan(*struct_col, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const percent_out = cudf::scan(*struct_col, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), expected_null_result); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), expected_null_result); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(percent_out->view(), expected_percent_rank_null_result); + } + + // Next, verify that if the structs column has a null mask that is NOT pushed down to members, + // the ranks are still correct. + { + auto const null_iter = nulls_at({1, 2}); + struct_col->set_null_mask( + cudf::test::detail::make_null_mask(null_iter, null_iter + struct_col->size())); + auto const expected_dense = rank_result_col{1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9}; + auto const expected_rank = rank_result_col{1, 2, 2, 4, 5, 6, 7, 7, 9, 10, 10, 12}; + auto const expected_percent = percent_result_col{0.0, + 1.0 / 11, + 1.0 / 11, + 3.0 / 11, + 4.0 / 11, + 5.0 / 11, + 6.0 / 11, + 6.0 / 11, + 8.0 / 11, + 9.0 / 11, + 9.0 / 11, + 11.0 / 11}; + auto const dense_out = cudf::scan(*struct_col, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const rank_out = cudf::scan(*struct_col, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const percent_out = cudf::scan(*struct_col, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), expected_dense); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), expected_rank); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(percent_out->view(), expected_percent); + } } struct RankScanTest : public cudf::test::BaseFixture { @@ -207,49 +261,66 @@ struct RankScanTest : public cudf::test::BaseFixture { TEST(RankScanTest, BoolRank) { - cudf::test::fixed_width_column_wrapper vals{0, 0, 0, 6, 6, 9, 11, 11, 11, 11, 14, 34}; - cudf::test::fixed_width_column_wrapper expected_dense_vals{ - 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2}; - cudf::test::fixed_width_column_wrapper expected_rank_vals{ - 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4}; - - auto dense_out = cudf::scan( - vals, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_out = - cudf::scan(vals, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense_vals, dense_out->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank_vals, rank_out->view()); + auto const vals = input{0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + auto const expected_dense = rank_result_col{1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4}; + auto const expected_percent = percent_result_col{0.0, + 0.0, +
0.0, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11}; + + auto const dense_out = cudf::scan(vals, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const rank_out = cudf::scan(vals, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto const percent_out = cudf::scan(vals, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense, dense_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank, rank_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_percent, percent_out->view()); } TEST(RankScanTest, BoolRankWithNull) { - cudf::test::fixed_width_column_wrapper vals{{0, 0, 0, 6, 6, 9, 11, 11, 11, 11, 14, 34}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0}}; - cudf::table_view order_table{std::vector{vals}}; - cudf::test::fixed_width_column_wrapper expected_dense_vals{ - 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3}; - cudf::test::fixed_width_column_wrapper expected_rank_vals{ - 1, 1, 1, 4, 4, 4, 4, 4, 9, 9, 9, 9}; - - auto nullable_dense_out = cudf::scan( - vals, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto nullable_rank_out = - cudf::scan(vals, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense_vals, nullable_dense_out->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank_vals, nullable_rank_out->view()); + auto const vals = input{{0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}, nulls_at({8, 9, 10, 11})}; + auto const expected_dense = rank_result_col{1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3}; + auto const expected_rank = rank_result_col{1, 1, 1, 4, 4, 4, 4, 4, 9, 9, 9, 9}; + auto const expected_percent = percent_result_col{0.0, + 0.0, + 0.0, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 3.0 / 11, + 8.0 / 11, + 8.0 / 11, + 8.0 / 11, + 8.0 / 11}; + + auto nullable_dense_out = cudf::scan(vals, dense_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto nullable_rank_out = cudf::scan(vals, rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + auto nullable_percent_out = cudf::scan(vals, percent_rank, INCLUSIVE_SCAN, INCLUDE_NULLS); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense, nullable_dense_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank, nullable_rank_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_percent, nullable_percent_out->view()); } TEST(RankScanTest, ExclusiveScan) { - cudf::test::fixed_width_column_wrapper vals{3, 4, 5}; - cudf::test::fixed_width_column_wrapper order_col{3, 3, 1}; - cudf::table_view order_table{std::vector{order_col}}; - - CUDF_EXPECT_THROW_MESSAGE( - cudf::scan( - vals, cudf::make_dense_rank_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE), - "Unsupported dense rank aggregation operator for exclusive scan"); - CUDF_EXPECT_THROW_MESSAGE( - cudf::scan(vals, cudf::make_rank_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE), - "Unsupported rank aggregation operator for exclusive scan"); + auto const vals = input{3, 4, 5}; + + CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, dense_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), + "Dense rank aggregation operator requires an inclusive scan"); + CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), + "Rank aggregation operator requires an inclusive scan"); + CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, percent_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), + "Percent rank aggregation operator requires an inclusive scan"); } + +} // namespace cudf::test diff --git 
a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index a72ec61dd8f..bacd62ac86e 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,12 +14,12 @@ * limitations under the License. */ +#include #include #include #include #include #include -#include #include #include @@ -250,6 +250,33 @@ TEST_F(StringsContainsTests, OctalTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsContainsTests, HexTest) +{ + std::vector ascii_chars( // all possible matchable chars + {thrust::make_counting_iterator(0), thrust::make_counting_iterator(127)}); + auto const count = static_cast(ascii_chars.size()); + std::vector offsets( + {thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + count + 1}); + auto d_chars = cudf::detail::make_device_uvector_sync(ascii_chars); + auto d_offsets = cudf::detail::make_device_uvector_sync(offsets); + auto input = cudf::make_strings_column(d_chars, d_offsets); + + auto strings_view = cudf::strings_column_view(input->view()); + for (auto ch : ascii_chars) { + std::stringstream str; + str << "\\x" << std::setfill('0') << std::setw(2) << std::hex << static_cast(ch); + std::string pattern = str.str(); + + auto results = cudf::strings::contains_re(strings_view, pattern); + // only one element in the input should match ch + auto true_dat = cudf::detail::make_counting_transform_iterator( + 0, [ch](auto idx) { return ch == static_cast(idx); }); + cudf::test::fixed_width_column_wrapper expected(true_dat, true_dat + count); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + TEST_F(StringsContainsTests, EmbeddedNullCharacter) { std::vector data(10); diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 516882bd8ad..9a28dbf0697 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
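HexTest above generates one pattern per ASCII code point, so each contains_re call should match exactly the single row holding that character. A standalone sketch of the pattern construction it uses (host-only; mirrors the test's stringstream formatting):

```cpp
#include <iomanip>
#include <sstream>
#include <string>

// Build a "\xNN" regex escape for a single character code, zero-padded to two
// hex digits, e.g. 65 ('A') -> "\x41".
std::string hex_pattern(int ch)
{
  std::stringstream str;
  str << "\\x" << std::setfill('0') << std::setw(2) << std::hex << ch;
  return str.str();
}
```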
@@ -150,6 +150,39 @@ TEST_F(StringsExtractTests, ExtractEventTest) } } +TEST_F(StringsExtractTests, MultiLine) +{ + auto input = + cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); + auto view = cudf::strings_column_view(input); + + auto results = cudf::strings::extract(view, "(^[a-c]+$)", cudf::strings::regex_flags::MULTILINE); + cudf::test::strings_column_wrapper expected_multiline({"abc", "abc", "abc", "", "abc"}, + {1, 1, 1, 0, 1}); + auto expected = cudf::table_view{{expected_multiline}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + results = cudf::strings::extract(view, "^([a-c]+)$"); + cudf::test::strings_column_wrapper expected_default({"", "", "abc", "", ""}, {0, 0, 1, 0, 0}); + expected = cudf::table_view{{expected_default}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); +} + +TEST_F(StringsExtractTests, DotAll) +{ + auto input = cudf::test::strings_column_wrapper({"abc\nfa\nef", "fff\nabbc\nfff", "abcdef", ""}); + auto view = cudf::strings_column_view(input); + + auto results = cudf::strings::extract(view, "(a.*f)", cudf::strings::regex_flags::DOTALL); + cudf::test::strings_column_wrapper expected_dotall({"abc\nfa\nef", "abbc\nfff", "abcdef", ""}, + {1, 1, 1, 0}); + auto expected = cudf::table_view{{expected_dotall}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); + results = cudf::strings::extract(view, "(a.*f)"); + cudf::test::strings_column_wrapper expected_default({"", "", "abcdef", ""}, {0, 0, 1, 0}); + expected = cudf::table_view{{expected_default}}; + CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; @@ -181,7 +214,7 @@ TEST_F(StringsExtractTests, ExtractAllTest) cudf::test::strings_column_wrapper input(h_input.begin(), h_input.end(), validity); auto sv = cudf::strings_column_view(input); - auto results = cudf::strings::extract_all(sv, "(\\d+) (\\w+)"); + auto results = cudf::strings::extract_all_record(sv, "(\\d+) (\\w+)"); bool valids[] = {true, true, true, false, false, false, true}; using LCW = cudf::test::lists_column_wrapper; @@ -201,7 +234,7 @@ TEST_F(StringsExtractTests, Errors) cudf::test::strings_column_wrapper input({"this column intentionally left blank"}); auto sv = cudf::strings_column_view(input); EXPECT_THROW(cudf::strings::extract(sv, "\\w+"), cudf::logic_error); - EXPECT_THROW(cudf::strings::extract_all(sv, "\\w+"), cudf::logic_error); + EXPECT_THROW(cudf::strings::extract_all_record(sv, "\\w+"), cudf::logic_error); } TEST_F(StringsExtractTests, MediumRegex) diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 5872a9e5bb7..81122b1c5d8 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
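The MultiLine and DotAll tests above pin down the two regex flags: MULTILINE lets ^ and $ anchor at embedded newlines, while DOTALL lets '.' match across them. A condensed sketch of the MULTILINE contrast, using the same extract signature the tests call (the include paths are my assumption):

```cpp
#include <cudf/strings/extract.hpp>
#include <cudf/strings/regex/flags.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void regex_flags_demo()
{
  auto const input = cudf::test::strings_column_wrapper({"abc\nfff"});
  auto const view  = cudf::strings_column_view(input);
  // Default flags: "(^[a-c]+$)" must span the whole string, so "abc\nfff" extracts null.
  auto const whole_string = cudf::strings::extract(view, "(^[a-c]+$)");
  // MULTILINE: '\n' ends a line, so the leading "abc" is extracted.
  auto const per_line =
    cudf::strings::extract(view, "(^[a-c]+$)", cudf::strings::regex_flags::MULTILINE);
}
```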
@@ -112,6 +112,22 @@ TEST_F(StringsConvertTest, ToFixedPointDecimal128) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } +TEST_F(StringsConvertTest, ToFixedPointLargeScale) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + auto const strings = cudf::test::strings_column_wrapper({"0.05", "0.06", "0.50", "5.01"}); + + auto const scale = scale_type{-25}; + auto const type = cudf::data_type{cudf::type_to_id(), scale}; + auto const results = cudf::strings::to_fixed_point(cudf::strings_column_view(strings), type); + + auto const expected = fp_wrapper{{5, 6, 50, 501}, scale_type{-2}}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); +} + TEST_F(StringsConvertTest, FromFixedPointDecimal128) { using namespace numeric; diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index de4e48fd70a..f0d7315929b 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,14 +18,15 @@ #include #include #include +#include #include #include #include #include #include +#include #include -#include #include @@ -240,41 +241,6 @@ TEST_F(StringsSplitTest, RSplitWhitespaceWithMax) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, *expected); } -TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) -{ - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::split(zero_size_strings_column); - EXPECT_TRUE(results->num_columns() == 1); - cudf::test::expect_strings_empty(results->get_column(0)); - results = cudf::strings::rsplit(zero_size_strings_column); - EXPECT_TRUE(results->num_columns() == 1); - cudf::test::expect_strings_empty(results->get_column(0)); -} - -// This test specifically for https://github.com/rapidsai/custrings/issues/119 -TEST_F(StringsSplitTest, AllNullsCase) -{ - std::vector h_strings{nullptr, nullptr, nullptr}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - - auto results = cudf::strings::split(cudf::strings_column_view(strings)); - EXPECT_TRUE(results->num_columns() == 1); - auto column = results->get_column(0).view(); - EXPECT_TRUE(column.size() == 3); - EXPECT_TRUE(column.has_nulls()); - EXPECT_TRUE(column.null_count() == column.size()); - results = cudf::strings::split(cudf::strings_column_view(strings), cudf::string_scalar("-")); - EXPECT_TRUE(results->num_columns() == 1); - column = results->get_column(0); - EXPECT_TRUE(column.size() == 3); - EXPECT_TRUE(column.has_nulls()); - EXPECT_TRUE(column.null_count() == column.size()); -} - TEST_F(StringsSplitTest, SplitRecord) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; @@ -339,6 +305,127 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, SplitRegex) +{ + std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper 
input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + { + auto result = cudf::strings::split_re(sv, "\\s+"); + + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, validity); + cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0}); + cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0}); + auto expected = cudf::table_view({col0, col1, col2}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + + // rsplit == split when using default parameters + result = cudf::strings::rsplit_re(sv, "\\s+"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } + + { + auto result = cudf::strings::split_re(sv, "[eé]"); + + cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, validity); + cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""}, + {1, 0, 1, 1, 0}); + cudf::test::strings_column_wrapper col2({"s", "", " ", "", ""}, {1, 0, 1, 0, 0}); + cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0}); + auto expected = cudf::table_view({col0, col1, col2, col3}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + + // rsplit == split when using default parameters + result = cudf::strings::rsplit_re(sv, "[eé]"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } +} + +TEST_F(StringsSplitTest, SplitRecordRegex) +{ + std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + using LCW = cudf::test::lists_column_wrapper; + { + auto result = cudf::strings::split_record_re(sv, "\\s+"); + + LCW expected( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", ""}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + // rsplit == split when using default parameters + result = cudf::strings::rsplit_record_re(sv, "\\s+"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } + + { + auto result = cudf::strings::split_record_re(sv, "[eé]"); + + LCW expected({LCW{" H", "llo th", "s", ""}, + LCW{}, + LCW{"ar", " som", " "}, + LCW{"t", "st String"}, + LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + // rsplit == split when using default parameters + result = cudf::strings::rsplit_record_re(sv, "[eé]"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } +} + +TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) +{ + std::vector h_strings{" Héllo\tthesé", nullptr, "are\nsome ", "tést\rString", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + { + auto result = cudf::strings::split_re(sv, "\\s+", 1); + + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some ", "String", ""}, + {1, 0, 1, 1, 0}); + auto expected = cudf::table_view({col0, col1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + + // split everything is the same output as maxsplit==2 for 
the test input column here + result = cudf::strings::split_re(sv, "\\s+", 2); + auto expected2 = cudf::strings::split_re(sv, "\\s+"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected2->view()); + } + { + auto result = cudf::strings::split_record_re(sv, "\\s", 1); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected1( + {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected1); + + result = cudf::strings::split_record_re(sv, "\\s", 2); + LCW expected2( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", " "}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected2); + + // split everything is the same output as maxsplit==3 for the test input column here + result = cudf::strings::split_record_re(sv, "\\s", 3); + auto expected0 = cudf::strings::split_record_re(sv, "\\s"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); + } +} + TEST_F(StringsSplitTest, RSplitRecord) { std::vector h_strings{ @@ -430,14 +517,100 @@ TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } -TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns) +TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit) +{ + std::vector h_strings{" Héllo\tthesé", nullptr, "are some\n ", "tést\rString", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + { + auto result = cudf::strings::rsplit_re(sv, "\\s+", 1); + + cudf::test::strings_column_wrapper col0({" Héllo", "", "are some", "tést", ""}, validity); + cudf::test::strings_column_wrapper col1({"thesé", "", "", "String", ""}, {1, 0, 1, 1, 0}); + auto expected = cudf::table_view({col0, col1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } + { + auto result = cudf::strings::rsplit_record_re(sv, "\\s+", 1); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + // split everything is the same output as any maxsplit > 2 for the test input column here + result = cudf::strings::rsplit_record_re(sv, "\\s+", 3); + auto expected0 = cudf::strings::rsplit_record_re(sv, "\\s+"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); + } +} + +TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto split_record_result = cudf::strings::split_record(zero_size_strings_column); - EXPECT_TRUE(split_record_result->size() == 0); - auto rsplit_record_result = cudf::strings::rsplit_record(zero_size_strings_column); - EXPECT_TRUE(rsplit_record_result->size() == 0); + auto results = cudf::strings::split(zero_size_strings_column); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::rsplit(zero_size_strings_column); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::split_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(results->num_columns() == 1); + 
EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::rsplit_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + + auto list_result = cudf::strings::split_record(zero_size_strings_column); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::rsplit_record(zero_size_strings_column); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::split_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(list_result->size() == 0); +} + +// This test specifically for https://github.com/rapidsai/custrings/issues/119 +TEST_F(StringsSplitTest, AllNullsCase) +{ + cudf::test::strings_column_wrapper input({"", "", ""}, {0, 0, 0}); + auto sv = cudf::strings_column_view(input); + + auto results = cudf::strings::split(sv); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::split(sv, cudf::string_scalar("-")); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::rsplit(sv); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::rsplit(sv, cudf::string_scalar("-")); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::split_re(sv, "-"); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::rsplit_re(sv, "-"); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + + auto list_result = cudf::strings::split_record(sv); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{}}, cudf::test::iterators::all_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::rsplit_record(sv); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::split_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::rsplit_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); } TEST_F(StringsSplitTest, Partition) @@ -612,13 +785,20 @@ TEST_F(StringsSplitTest, PartitionZeroSizeStringsColumns) TEST_F(StringsSplitTest, InvalidParameter) { - std::vector h_strings{"string left intentionally blank"}; - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); - auto strings_view = cudf::strings_column_view(strings); + cudf::test::strings_column_wrapper input({"string left intentionally blank"}); + auto strings_view = cudf::strings_column_view(input); EXPECT_THROW(cudf::strings::split(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rsplit(strings_view, cudf::string_scalar("", false)), cudf::logic_error); + EXPECT_THROW(cudf::strings::split_record(strings_view, cudf::string_scalar("", false)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_record(strings_view, cudf::string_scalar("", false)), + cudf::logic_error); + 
EXPECT_THROW(cudf::strings::split_re(strings_view, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_re(strings_view, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_record_re(strings_view, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rpartition(strings_view, cudf::string_scalar("", false)), diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst index 190978a7581..141e5adba93 100644 --- a/docs/cudf/source/api_docs/groupby.rst +++ b/docs/cudf/source/api_docs/groupby.rst @@ -61,6 +61,7 @@ Computations / descriptive stats GroupBy.sum GroupBy.var GroupBy.corr + GroupBy.cov The following methods are available in both ``SeriesGroupBy`` and ``DataFrameGroupBy`` objects, but may differ slightly, usually in that diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst index cbc8f7e712f..f74853769f6 100644 --- a/docs/cudf/source/basics/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -180,7 +180,8 @@ that support them: +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ | corr | ✅ | | | | | | | ✅ | +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ - + | cov | ✅ | | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ GroupBy apply ------------- diff --git a/java/pom.xml b/java/pom.xml old mode 100755 new mode 100644 index 8f0fb1000d8..02828a21e67 --- a/java/pom.xml +++ b/java/pom.xml @@ -147,7 +147,7 @@ org.apache.hadoop hadoop-common - 3.1.0 + 3.1.4 test diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java index 1d73bd71246..d10329ca0f2 100644 --- a/java/src/main/java/ai/rapids/cudf/Aggregation.java +++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,8 +66,9 @@ enum Kind { MERGE_M2(27), RANK(28), DENSE_RANK(29), - TDIGEST(30), // This can take a delta argument for accuracy level - MERGE_TDIGEST(31); // This can take a delta argument for accuracy level + PERCENT_RANK(30), + TDIGEST(31), // This can take a delta argument for accuracy level + MERGE_TDIGEST(32); // This can take a delta argument for accuracy level final int nativeId; @@ -754,6 +755,19 @@ static DenseRankAggregation denseRank() { return new DenseRankAggregation(); } + static final class PercentRankAggregation extends NoParamAggregation { + private PercentRankAggregation() { + super(Kind.PERCENT_RANK); + } + } + + /** + * Get the row's percent ranking. + */ + static PercentRankAggregation percentRank() { + return new PercentRankAggregation(); + } + /** * Collect the values into a list. Nulls will be skipped. 
*/ diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 3ff2a370e4f..f91ee5535b1 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -826,18 +826,18 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co /** * Creates a deep copy of a column while replacing the validity mask. The validity mask is the * device_vector equivalent of the boolean column given as argument. - * + * * The boolColumn must have the same number of rows as the current column. - * The result column will have the same number of rows as the current column. + * The result column will have the same number of rows as the current column. * For all indices `i` where the boolColumn is `true`, the result column will have a valid value at index i. * For all other values (i.e. `false` or `null`), the result column will have nulls. - * + * * If the current column has a null at a given index `i`, and the new validity mask is `true` at index `i`, * then the row value is undefined. - * + * * @param boolColumn bool column whose value is to be used as the validity mask. * @return Deep copy of the column with replaced validity mask. - */ + */ public final ColumnVector copyWithBooleanColumnAsValidity(ColumnView boolColumn) { return new ColumnVector(copyWithBooleanColumnAsValidity(getNativeView(), boolColumn.getNativeView())); } @@ -2345,88 +2345,128 @@ public final ColumnVector stringLocate(Scalar substring, int start, int end) { } /** - * Returns a list of columns by splitting each string using the specified delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return list of strings columns as a table. 
*/ - public final Table stringSplit(Scalar delimiter, int maxSplit) { + public final Table stringSplit(String pattern, int limit, boolean splitByRegex) { assert type.equals(DType.STRING) : "column type must be a String"; - assert delimiter != null : "delimiter may not be null"; - assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new Table(stringSplit(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + assert pattern != null : "pattern is null"; + assert pattern.length() > 0 : "empty pattern is not supported"; + assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported"; + return new Table(stringSplit(this.getNativeView(), pattern, limit, splitByRegex)); } - + /** - * Returns a list of columns by splitting each string using the specified delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return list of strings columns as a table. */ - public final Table stringSplit(Scalar delimiter) { - return stringSplit(delimiter, -1); + public final Table stringSplit(String pattern, boolean splitByRegex) { + return stringSplit(pattern, -1, splitByRegex); } /** - * Returns a list of columns by splitting each string using whitespace as the delimiter. - * The number of rows in the output columns will be the same as the input column. - * Null entries are added for a row where split results have been exhausted. - * Null string entries return corresponding null output columns. - * @return New table of strings columns. + * Returns a list of columns by splitting each string using the specified string literal + * delimiter. The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. Null input entries + * result in all nulls in the corresponding rows of the output columns. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @return list of strings columns as a table. 
*/ - public final Table stringSplit() { - try (Scalar emptyString = Scalar.fromString("")) { - return stringSplit(emptyString, -1); - } + public final Table stringSplit(String delimiter, int limit) { + return stringSplit(delimiter, limit, false); } /** - * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. + * Returns a list of columns by splitting each string using the specified string literal + * delimiter. The number of rows in the output columns will be the same as the input column. + * Null entries are added for a row where split results have been exhausted. Null input entries + * result in all nulls in the corresponding rows of the output columns. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @return list of strings columns as a table. */ - public final ColumnVector stringSplitRecord() { - return stringSplitRecord(-1); + public final Table stringSplit(String delimiter) { + return stringSplit(delimiter, -1, false); } /** - * Returns a column of lists of strings by splitting each string using whitespace as the delimiter. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * Returns a column of lists of strings in which each list is made by splitting the + * corresponding input string using the specified pattern. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return a LIST column of string elements. */ - public final ColumnVector stringSplitRecord(int maxSplit) { - try (Scalar emptyString = Scalar.fromString("")) { - return stringSplitRecord(emptyString, maxSplit); - } + public final ColumnVector stringSplitRecord(String pattern, int limit, boolean splitByRegex) { + assert type.equals(DType.STRING) : "column type must be String"; + assert pattern != null : "pattern is null"; + assert pattern.length() > 0 : "empty pattern is not supported"; + assert limit != 0 && limit != 1 : "split limit == 0 and limit == 1 are not supported"; + return new ColumnVector( + stringSplitRecord(this.getNativeView(), pattern, limit, splitByRegex)); + } + + /** + * Returns a column of lists of strings in which each list is made by splitting the + * corresponding input string using the specified pattern. + * + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + * @return a LIST column of string elements. + */ + public final ColumnVector stringSplitRecord(String pattern, boolean splitByRegex) { + return stringSplitRecord(pattern, -1, splitByRegex); } /** - * Returns a column of lists of strings by splitting each string using the specified delimiter. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace.
+ * Returns a column of lists of strings in which each list is made by splitting the + * corresponding input string using the specified string literal delimiter. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @return a LIST column of string elements. */ - public final ColumnVector stringSplitRecord(Scalar delimiter) { - return stringSplitRecord(delimiter, -1); + public final ColumnVector stringSplitRecord(String delimiter, int limit) { + return stringSplitRecord(delimiter, limit, false); } /** - * Returns a column that is a list of strings. Each string list is made by splitting each input - * string using the specified delimiter. - * @param delimiter UTF-8 encoded string identifying the split points in each string. - * An empty string indicates split on whitespace. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. - * @return New table of strings columns. + * Returns a column of lists of strings in which each list is made by splitting the + * corresponding input string using the specified string literal delimiter. + * + * @param delimiter UTF-8 encoded string identifying the split delimiter for each input string. + * @return a LIST column of string elements. */ - public final ColumnVector stringSplitRecord(Scalar delimiter, int maxSplit) { - assert type.equals(DType.STRING) : "column type must be a String"; - assert delimiter != null : "delimiter may not be null"; - assert delimiter.getType().equals(DType.STRING) : "delimiter must be a string scalar"; - return new ColumnVector( - stringSplitRecord(this.getNativeView(), delimiter.getScalarHandle(), maxSplit)); + public final ColumnVector stringSplitRecord(String delimiter) { + return stringSplitRecord(delimiter, -1, false); } /** @@ -3248,7 +3288,7 @@ public enum FindOptions {FIND_FIRST, FIND_LAST}; * Create a column of int32 indices, indicating the position of the scalar search key * in each list row. * All indices are 0-based. If a search key is not found, the index is set to -1. - * The index is set to null if one of the following is true: + * The index is set to null if one of the following is true: * 1. The search key is null. * 2. The list row is null. * @param key The scalar search key @@ -3265,7 +3305,7 @@ public final ColumnVector listIndexOf(Scalar key, FindOptions findOption) { * Create a column of int32 indices, indicating the position of each row in the * search key column in the corresponding row of the lists column. * All indices are 0-based. If a search key is not found, the index is set to -1. - * The index is set to null if one of the following is true: + * The index is set to null if one of the following is true: * 1. The search key row is null. * 2. The list row is null. * @param keys ColumnView of search keys. @@ -3531,15 +3571,36 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle private static native long substringLocate(long columnView, long substringScalar, int start, int end); /** - * Native method which returns array of columns by splitting each string using the specified - * delimiter. - * @param columnView native handle of the cudf::column_view being operated on.
- * @param delimiter UTF-8 encoded string identifying the split points in each string. - * @param maxSplit the maximum number of splits to perform, or -1 for all possible splits. + * Returns a list of columns by splitting each string using the specified pattern. The number of + * rows in the output columns will be the same as the input column. Null entries are added for a + * row where split results have been exhausted. Null input entries result in all nulls in the + * corresponding rows of the output columns. + * + * @param nativeHandle native handle of the input strings column that is being operated on. + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. */ - private static native long[] stringSplit(long columnView, long delimiter, int maxSplit); + private static native long[] stringSplit(long nativeHandle, String pattern, int limit, + boolean splitByRegex); - private static native long stringSplitRecord(long nativeView, long scalarHandle, int maxSplit); + /** + * Returns a column of lists of strings in which each list is made by splitting the + * corresponding input string using the specified pattern. + * + * @param nativeHandle native handle of the input strings column that is being operated on. + * @param pattern UTF-8 encoded string identifying the split pattern for each input string. + * @param limit the maximum size of the list resulting from splitting each input string, + * or -1 for all possible splits. Note that limit = 0 (all possible splits without + * trailing empty strings) and limit = 1 (no split at all) are not supported. + * @param splitByRegex a boolean flag indicating whether the input strings will be split by a + * regular expression pattern or just by a string literal delimiter. + */ + private static native long stringSplitRecord(long nativeHandle, String pattern, int limit, + boolean splitByRegex); /** * Native method to calculate substring from a given string column. 0 indexing. @@ -3714,7 +3775,7 @@ private static native long stringReplaceWithBackrefs(long columnView, String pat /** * Native method to search list rows for null elements. * @param nativeView the column view handle of the list - * @return column handle of the resultant boolean column + * @return column handle of the resultant boolean column */ private static native long listContainsNulls(long nativeView); @@ -3896,20 +3957,20 @@ private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] vi /** * Native method to deep copy a column while replacing the null mask. The null mask is the * device_vector equivalent of the boolean column given as argument. - * + * * The boolColumn must have the same number of rows as the exemplar column. * The result column will have the same number of rows as the exemplar. * For all indices `i` where the boolean column is `true`, the result column will have a valid value at index i. * For all other values (i.e. `false` or `null`), the result column will have nulls.
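 * For illustration (hypothetical values): an exemplar column [1, null, 3] combined with a
 * bool column [true, true, false] yields [1, undefined, null]; row 1 is marked valid but its
 * payload is unspecified, as described below.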
- * + * If the exemplar column has a null at a given index `i`, and the new validity mask is `true` at index `i`, * then the resultant row value is undefined. - * + * * @param exemplarViewHandle column view of the column that is deep copied. * @param boolColumnViewHandle bool column whose value is to be used as the null mask. * @return Deep copy of the column with replaced null mask. - */ - private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, + */ + private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle, long boolColumnViewHandle) throws CudfException; //////// diff --git a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java index 0e49636fae6..78b3d5d52ec 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,9 @@ private ColumnWriterOptions(AbstractStructBuilder builder) { (ColumnWriterOptions[]) builder.children.toArray(new ColumnWriterOptions[0]); } + // The sentinel value for unknown precision (the default) + public static final int UNKNOWN_PRECISION = -1; + /** * Constructor used for list */ @@ -103,7 +106,7 @@ protected ColumnWriterOptions withDecimal(String name, int precision, protected ColumnWriterOptions withTimestamp(String name, boolean isInt96, boolean isNullable) { - return new ColumnWriterOptions(name, isInt96, 0, isNullable); + return new ColumnWriterOptions(name, isInt96, UNKNOWN_PRECISION, isNullable); } /** @@ -243,7 +246,7 @@ public ColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, public ColumnWriterOptions(String columnName, boolean isNullable) { this.isTimestampTypeInt96 = false; - this.precision = 0; + this.precision = UNKNOWN_PRECISION; this.isNullable = isNullable; this.columnName = columnName; } diff --git a/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java index 219b6dde05d..006941deec6 100644 --- a/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/GroupByScanAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -115,4 +115,11 @@ public static GroupByScanAggregation rank() { public static GroupByScanAggregation denseRank() { return new GroupByScanAggregation(Aggregation.denseRank()); } + + /** + * Get the row's percent ranking. + */ + public static GroupByScanAggregation percentRank() { + return new GroupByScanAggregation(Aggregation.percentRank()); + } } diff --git a/java/src/main/java/ai/rapids/cudf/ScanAggregation.java b/java/src/main/java/ai/rapids/cudf/ScanAggregation.java index 08489562adc..752fd57355b 100644 --- a/java/src/main/java/ai/rapids/cudf/ScanAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/ScanAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION.
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,4 +97,11 @@ public static ScanAggregation rank() { public static ScanAggregation denseRank() { return new ScanAggregation(Aggregation.denseRank()); } + + /** + * Get the row's percent rank. + */ + public static ScanAggregation percentRank() { + return new ScanAggregation(Aggregation.percentRank()); + } } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index bb0321d0a16..17e10933b65 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -539,14 +539,11 @@ private static native long[] leftJoin(long leftTable, int[] leftJoinCols, long r private static native long[] leftJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; - private static native long leftJoinRowCount(long leftTable, long rightHashJoin, - boolean nullsEqual) throws CudfException; + private static native long leftJoinRowCount(long leftTable, long rightHashJoin) throws CudfException; - private static native long[] leftHashJoinGatherMaps(long leftTable, long rightHashJoin, - boolean nullsEqual) throws CudfException; + private static native long[] leftHashJoinGatherMaps(long leftTable, long rightHashJoin) throws CudfException; private static native long[] leftHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin, - boolean nullsEqual, long outputRowCount) throws CudfException; private static native long[] innerJoin(long leftTable, int[] leftJoinCols, long rightTable, @@ -555,14 +552,11 @@ private static native long[] innerJoin(long leftTable, int[] leftJoinCols, long private static native long[] innerJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; - private static native long innerJoinRowCount(long table, long hashJoin, - boolean nullsEqual) throws CudfException; + private static native long innerJoinRowCount(long table, long hashJoin) throws CudfException; - private static native long[] innerHashJoinGatherMaps(long table, long hashJoin, - boolean nullsEqual) throws CudfException; + private static native long[] innerHashJoinGatherMaps(long table, long hashJoin) throws CudfException; private static native long[] innerHashJoinGatherMapsWithCount(long table, long hashJoin, - boolean nullsEqual, long outputRowCount) throws CudfException; private static native long[] fullJoin(long leftTable, int[] leftJoinCols, long rightTable, @@ -571,14 +565,11 @@ private static native long[] fullJoin(long leftTable, int[] leftJoinCols, long r private static native long[] fullJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; - private static native long fullJoinRowCount(long leftTable, long rightHashJoin, - boolean nullsEqual) throws CudfException; + private static native long fullJoinRowCount(long leftTable, long rightHashJoin) throws CudfException; - private static native long[] fullHashJoinGatherMaps(long leftTable, long rightHashJoin, - boolean nullsEqual) throws CudfException; + private static native long[] fullHashJoinGatherMaps(long leftTable, long rightHashJoin) throws CudfException; private static native long[] fullHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin, - boolean nullsEqual, long outputRowCount) throws CudfException; private static native long[] leftSemiJoin(long leftTable, int[] leftJoinCols, long rightTable, @@ -2318,8 
+2309,7 @@ public long leftJoinRowCount(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - return leftJoinRowCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + return leftJoinRowCount(getNativeView(), rightHash.getNativeView()); } /** @@ -2337,9 +2327,7 @@ public GatherMap[] leftJoinGatherMaps(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - leftHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + long[] gatherMapData = leftHashJoinGatherMaps(getNativeView(), rightHash.getNativeView()); return buildJoinGatherMaps(gatherMapData); } @@ -2363,9 +2351,8 @@ public GatherMap[] leftJoinGatherMaps(HashJoin rightHash, long outputRowCount) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - leftHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls(), outputRowCount); + long[] gatherMapData = leftHashJoinGatherMapsWithCount(getNativeView(), + rightHash.getNativeView(), outputRowCount); return buildJoinGatherMaps(gatherMapData); } @@ -2545,8 +2532,7 @@ public long innerJoinRowCount(HashJoin otherHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "otherKeys: " + otherHash.getNumberOfColumns()); } - return innerJoinRowCount(getNativeView(), otherHash.getNativeView(), - otherHash.getCompareNulls()); + return innerJoinRowCount(getNativeView(), otherHash.getNativeView()); } /** @@ -2564,9 +2550,7 @@ public GatherMap[] innerJoinGatherMaps(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - innerHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + long[] gatherMapData = innerHashJoinGatherMaps(getNativeView(), rightHash.getNativeView()); return buildJoinGatherMaps(gatherMapData); } @@ -2590,9 +2574,8 @@ public GatherMap[] innerJoinGatherMaps(HashJoin rightHash, long outputRowCount) throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - innerHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls(), outputRowCount); + long[] gatherMapData = innerHashJoinGatherMapsWithCount(getNativeView(), + rightHash.getNativeView(), outputRowCount); return buildJoinGatherMaps(gatherMapData); } @@ -2778,8 +2761,7 @@ public long fullJoinRowCount(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - return fullJoinRowCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + return fullJoinRowCount(getNativeView(), rightHash.getNativeView()); } /** @@ -2797,9 +2779,7 @@ public GatherMap[] fullJoinGatherMaps(HashJoin rightHash) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - 
fullHashJoinGatherMaps(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls()); + long[] gatherMapData = fullHashJoinGatherMaps(getNativeView(), rightHash.getNativeView()); return buildJoinGatherMaps(gatherMapData); } @@ -2823,9 +2803,8 @@ public GatherMap[] fullJoinGatherMaps(HashJoin rightHash, long outputRowCount) { throw new IllegalArgumentException("column count mismatch, this: " + getNumberOfColumns() + "rightKeys: " + rightHash.getNumberOfColumns()); } - long[] gatherMapData = - fullHashJoinGatherMapsWithCount(getNativeView(), rightHash.getNativeView(), - rightHash.getCompareNulls(), outputRowCount); + long[] gatherMapData = fullHashJoinGatherMapsWithCount(getNativeView(), + rightHash.getNativeView(), outputRowCount); return buildJoinGatherMaps(gatherMapData); } diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp index 93a01854ced..f8c448566c8 100644 --- a/java/src/main/native/src/AggregationJni.cpp +++ b/java/src/main/native/src/AggregationJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -85,6 +85,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv return cudf::make_rank_aggregation(); case 29: // DENSE_RANK return cudf::make_dense_rank_aggregation(); + case 30: // PERCENT_RANK + return cudf::make_percent_rank_aggregation(); default: throw std::logic_error("Unsupported No Parameter Aggregation Operation"); } }(); @@ -139,10 +141,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createTDigestAgg(JNIEnv std::unique_ptr ret; // These numbers come from Aggregation.java and must stay in sync switch (kind) { - case 30: // TDIGEST + case 31: // TDIGEST ret = cudf::make_tdigest_aggregation(delta); break; - case 31: // MERGE_TDIGEST + case 32: // MERGE_TDIGEST ret = cudf::make_merge_tdigest_aggregation(delta); break; default: throw std::logic_error("Unsupported TDigest Aggregation Operation"); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index eec4a78a457..548844aa0d3 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -561,34 +562,78 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass, - jlong column_view, - jlong delimiter_ptr, - jint max_split) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter_ptr, "string scalar delimiter is null", 0); + jlong input_handle, + jstring pattern_obj, + jint limit, + jboolean split_by_regex) { + JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); + + if (limit == 0 || limit == 1) { + // Cannot achieve the results of splitting with limit == 0 or limit == 1. + // This is because cudf operates on a different parameter (`max_split`) which is converted from + // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an + // unlimited split. 
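+ // For illustration: limit = 2 maps to max_split = 1 (at most one split, so at
+ // most two output parts per row), while limit = -1 maps to max_split = -1
+ // (unlimited splits).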
+ JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "limit == 0 and limit == 1 are not supported", 0); + } + try { cudf::jni::auto_set_device(env); - cudf::strings_column_view const scv{*reinterpret_cast(column_view)}; - auto delimiter = reinterpret_cast(delimiter_ptr); + auto const input = reinterpret_cast(input_handle); + auto const strs_input = cudf::strings_column_view{*input}; - return cudf::jni::convert_table_for_return(env, - cudf::strings::split(scv, *delimiter, max_split)); + auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj); + if (pattern_jstr.is_empty()) { + // Java's split API produces different behaviors than cudf when splitting with empty + // pattern. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); + } + + auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes()); + auto const max_split = limit > 1 ? limit - 1 : limit; + auto result = split_by_regex ? + cudf::strings::split_re(strs_input, pattern, max_split) : + cudf::strings::split(strs_input, cudf::string_scalar{pattern}, max_split); + return cudf::jni::convert_table_for_return(env, std::move(result)); } CATCH_STD(env, 0); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass, - jlong column_view, - jlong delimiter, - jint max_split) { - JNI_NULL_CHECK(env, column_view, "column is null", 0); - JNI_NULL_CHECK(env, delimiter, "string scalar delimiter is null", 0); + jlong input_handle, + jstring pattern_obj, + jint limit, + jboolean split_by_regex) { + JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0); + + if (limit == 0 || limit == 1) { + // Cannot achieve the results of splitting with limit == 0 or limit == 1. + // This is because cudf operates on a different parameter (`max_split`) which is converted from + // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an + // unlimited split. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "limit == 0 and limit == 1 are not supported", 0); + } + try { cudf::jni::auto_set_device(env); - cudf::column_view *cv = reinterpret_cast(column_view); - cudf::strings_column_view scv(*cv); - cudf::string_scalar *ss_scalar = reinterpret_cast(delimiter); - return release_as_jlong(cudf::strings::split_record(scv, *ss_scalar, max_split)); + auto const input = reinterpret_cast(input_handle); + auto const strs_input = cudf::strings_column_view{*input}; + + auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj); + if (pattern_jstr.is_empty()) { + // Java's split API produces different behaviors than cudf when splitting with empty + // pattern. + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Empty pattern is not supported", 0); + } + + auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes()); + auto const max_split = limit > 1 ? limit - 1 : limit; + auto result = + split_by_regex ? 
+ cudf::strings::split_record_re(strs_input, pattern, max_split) : + cudf::strings::split_record(strs_input, cudf::string_scalar{pattern}, max_split); + return release_as_jlong(result); } CATCH_STD(env, 0); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index aeac1856db0..1cf56da35da 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -676,9 +676,10 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, int write_index = 0; for (int i = 0; i < num_children; i++, write_index++) { cudf::io::column_in_metadata child; - child.set_name(col_names[read_index]) - .set_decimal_precision(precisions[read_index]) - .set_nullability(nullability[read_index]); + child.set_name(col_names[read_index]).set_nullability(nullability[read_index]); + if (precisions[read_index] > -1) { + child.set_decimal_precision(precisions[read_index]); + } if (!is_int96.is_null()) { child.set_int96_timestamps(is_int96[read_index]); } @@ -717,8 +718,10 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam for (int i = read_index, write_index = 0; i < top_level_children; i++, write_index++) { metadata.column_metadata[write_index] .set_name(cpp_names[read_index]) - .set_nullability(col_nullability[read_index]) - .set_decimal_precision(precisions[read_index]); + .set_nullability(col_nullability[read_index]); + if (precisions[read_index] > -1) { + metadata.column_metadata[write_index].set_decimal_precision(precisions[read_index]); + } if (!is_int96.is_null()) { metadata.column_metadata[write_index].set_int96_timestamps(is_int96[read_index]); } @@ -812,15 +815,14 @@ jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys, // a hash table built from the join's right table. template jlongArray hash_join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_hash_join, - jboolean compare_nulls_equal, T join_func) { + T join_func) { JNI_NULL_CHECK(env, j_left_keys, "left table is null", NULL); JNI_NULL_CHECK(env, j_right_hash_join, "hash join is null", NULL); try { cudf::jni::auto_set_device(env); auto left_keys = reinterpret_cast(j_left_keys); auto hash_join = reinterpret_cast(j_right_hash_join); - auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - return gather_maps_to_java(env, join_func(*left_keys, *hash_join, nulleq)); + return gather_maps_to_java(env, join_func(*left_keys, *hash_join)); } CATCH_STD(env, NULL); } @@ -2172,41 +2174,36 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoinGatherMaps( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_leftJoinRowCount(JNIEnv *env, jclass, jlong j_left_table, - jlong j_right_hash_join, - jboolean compare_nulls_equal) { + jlong j_right_hash_join) { JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); try { cudf::jni::auto_set_device(env); auto left_table = reinterpret_cast(j_left_table); auto hash_join = reinterpret_cast(j_right_hash_join); - auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - auto row_count = hash_join->left_join_size(*left_table, nulleq); + auto row_count = hash_join->left_join_size(*left_table); return static_cast(row_count); } CATCH_STD(env, 0); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMaps( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, - jboolean compare_nulls_equal) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join) { return cudf::jni::hash_join_gather_maps( - env, j_left_table, j_right_hash_join, compare_nulls_equal, - [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { - return hash.left_join(left, nulleq); + env, j_left_table, j_right_hash_join, + [](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.left_join(left); }); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMapsWithCount( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, - jlong j_output_row_count) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count) { auto output_row_count = static_cast(j_output_row_count); - return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, - [output_row_count](cudf::table_view const &left, - cudf::hash_join const &hash, - cudf::null_equality nulleq) { - return hash.left_join(left, nulleq, output_row_count); - }); + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, + [output_row_count](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.left_join(left, output_row_count); + }); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinRowCount(JNIEnv *env, jclass, @@ -2305,41 +2302,36 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoinGatherMaps( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_innerJoinRowCount(JNIEnv *env, jclass, jlong j_left_table, - jlong j_right_hash_join, - jboolean compare_nulls_equal) { + jlong j_right_hash_join) { JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); try { cudf::jni::auto_set_device(env); auto left_table = reinterpret_cast(j_left_table); auto hash_join = reinterpret_cast(j_right_hash_join); - auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - auto row_count = hash_join->inner_join_size(*left_table, nulleq); + auto row_count = hash_join->inner_join_size(*left_table); return static_cast(row_count); } CATCH_STD(env, 0); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMaps( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, - jboolean compare_nulls_equal) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join) { return cudf::jni::hash_join_gather_maps( - env, j_left_table, j_right_hash_join, compare_nulls_equal, - [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { - return hash.inner_join(left, nulleq); + env, j_left_table, j_right_hash_join, + [](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.inner_join(left); }); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMapsWithCount( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, - jlong j_output_row_count) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count) { auto output_row_count = static_cast(j_output_row_count); - return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, - [output_row_count](cudf::table_view const &left, - cudf::hash_join const &hash, - cudf::null_equality nulleq) { - return hash.inner_join(left, nulleq, output_row_count); - }); + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, + [output_row_count](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.inner_join(left, output_row_count); + }); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinRowCount(JNIEnv *env, jclass, @@ -2438,41 +2430,36 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoinGatherMaps( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_fullJoinRowCount(JNIEnv *env, jclass, jlong j_left_table, - jlong j_right_hash_join, - jboolean compare_nulls_equal) { + jlong j_right_hash_join) { JNI_NULL_CHECK(env, j_left_table, "left table is null", 0); JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0); try { cudf::jni::auto_set_device(env); auto left_table = reinterpret_cast(j_left_table); auto hash_join = reinterpret_cast(j_right_hash_join); - auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - auto row_count = hash_join->full_join_size(*left_table, nulleq); + auto row_count = hash_join->full_join_size(*left_table); return static_cast(row_count); } CATCH_STD(env, 0); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMaps( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, - jboolean compare_nulls_equal) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join) { return cudf::jni::hash_join_gather_maps( - env, j_left_table, j_right_hash_join, compare_nulls_equal, - [](cudf::table_view const &left, cudf::hash_join const &hash, cudf::null_equality nulleq) { - return hash.full_join(left, nulleq); + env, j_left_table, j_right_hash_join, + [](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.full_join(left); }); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMapsWithCount( - JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jboolean compare_nulls_equal, - jlong j_output_row_count) { + JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count) { auto output_row_count = static_cast(j_output_row_count); - return cudf::jni::hash_join_gather_maps(env, j_left_table, j_right_hash_join, compare_nulls_equal, - [output_row_count](cudf::table_view const &left, - cudf::hash_join const &hash, - cudf::null_equality nulleq) { - return hash.full_join(left, nulleq, output_row_count); - }); + return cudf::jni::hash_join_gather_maps( + env, j_left_table, j_right_hash_join, + [output_row_count](cudf::table_view const &left, cudf::hash_join const &hash) { + return hash.full_join(left, output_row_count); + }); } JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalFullJoinGatherMaps( diff --git a/java/src/main/native/src/row_conversion.cu b/java/src/main/native/src/row_conversion.cu index 4a5265b1d2e..5a2aa44261d 100644 --- a/java/src/main/native/src/row_conversion.cu +++ b/java/src/main/native/src/row_conversion.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -187,8 +189,82 @@ struct batch_data { std::vector row_batches; // information about each batch such as byte count }; -struct row_offset_functor { - row_offset_functor(size_type fixed_width_only_row_size) +/** + * @brief builds row size information for tables that contain strings + * + * @param tbl table from which to compute row size information + * @param fixed_width_and_validity_size size of fixed-width and validity data in this table + * @param stream cuda stream on which to operate + * @return device vector of size_types of the row sizes of the table + */ +rmm::device_uvector build_string_row_sizes(table_view const &tbl, + size_type fixed_width_and_validity_size, + rmm::cuda_stream_view stream) { + auto const num_rows = tbl.num_rows(); + rmm::device_uvector d_row_sizes(num_rows, stream); + thrust::uninitialized_fill(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), 0); + + auto d_offsets_iterators = [&]() { + std::vector offsets_iterators; + auto offsets_iter = thrust::make_transform_iterator( + tbl.begin(), [](auto const &col) -> strings_column_view::offset_iterator { + if (!is_fixed_width(col.type())) { + CUDF_EXPECTS(col.type().id() == type_id::STRING, "only string columns are supported!"); + return strings_column_view(col).offsets_begin(); + } else { + return nullptr; + } + }); + 
std::copy_if(offsets_iter, offsets_iter + tbl.num_columns(), + std::back_inserter(offsets_iterators), + [](auto const &offset_ptr) { return offset_ptr != nullptr; }); + return make_device_uvector_async(offsets_iterators, stream); + }(); + + auto const num_columns = static_cast(d_offsets_iterators.size()); + + thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_columns * num_rows), + [d_offsets_iterators = d_offsets_iterators.data(), num_columns, num_rows, + d_row_sizes = d_row_sizes.data()] __device__(auto element_idx) { + auto const row = element_idx % num_rows; + auto const col = element_idx / num_rows; + auto const val = + d_offsets_iterators[col][row + 1] - d_offsets_iterators[col][row]; + atomicAdd(&d_row_sizes[row], val); + }); + + // transform the row sizes to include fixed width size and alignment + thrust::transform(rmm::exec_policy(stream), d_row_sizes.begin(), d_row_sizes.end(), + d_row_sizes.begin(), [fixed_width_and_validity_size] __device__(auto row_size) { + return util::round_up_unsafe(fixed_width_and_validity_size + row_size, + JCUDF_ROW_ALIGNMENT); + }); + + return d_row_sizes; +} + +/** + * @brief functor to return the offset of a row in a table with string columns + * + */ +struct string_row_offset_functor { + string_row_offset_functor(device_span _d_row_offsets) + : d_row_offsets(_d_row_offsets){}; + + __device__ inline size_type operator()(int row_number, int) const { + return d_row_offsets[row_number]; + } + + device_span d_row_offsets; +}; + +/** + * @brief functor to return the offset of a row in a table with only fixed-width columns + * + */ +struct fixed_width_row_offset_functor { + fixed_width_row_offset_functor(size_type fixed_width_only_row_size) : _fixed_width_only_row_size(fixed_width_only_row_size){}; __device__ inline size_type operator()(int row_number, int tile_row_start) const { @@ -542,6 +618,10 @@ __global__ void copy_to_rows(const size_type num_rows, const size_type num_colum auto const relative_col = el / num_fetch_rows; auto const relative_row = el % num_fetch_rows; auto const absolute_col = relative_col + fetch_tile.start_col; + if (input_data[absolute_col] == nullptr) { + // variable-width data + continue; + } auto const absolute_row = relative_row + fetch_tile.start_row; auto const col_size = col_sizes[absolute_col]; auto const col_offset = col_offsets[absolute_col]; @@ -1194,10 +1274,8 @@ static size_type compute_column_information(iterator begin, iterator end, auto validity_offset = fixed_width_size_per_row; column_starts.push_back(validity_offset); - return util::round_up_unsafe( - fixed_width_size_per_row + - util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT), - JCUDF_ROW_ALIGNMENT); + return fixed_width_size_per_row + + util::div_rounding_up_safe(static_cast(std::distance(begin, end)), CHAR_BIT); } /** @@ -1512,20 +1590,27 @@ void determine_tiles(std::vector const &column_sizes, } } -#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - -} // namespace detail - -std::vector> convert_to_rows(table_view const &tbl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource *mr) { -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 - auto const num_columns = tbl.num_columns(); - auto const num_rows = tbl.num_rows(); - - auto const fixed_width_only = std::all_of( - tbl.begin(), tbl.end(), [](column_view const &c) { return is_fixed_width(c.type()); }); - +/** + * @brief convert cudf table into JCUDF row format + * + * @tparam offsetFunctor 
functor type for offset functor + * @param tbl table to convert to JCUDF row format + * @param batch_info information about the batches of data + * @param offset_functor functor that returns the starting offset of each row + * @param column_starts starting offset of a column in a row + * @param column_sizes size of each element in a column + * @param fixed_width_size_per_row size of fixed-width data in a row of this table + * @param stream stream used + * @param mr selected memory resource for returned data + * @return vector of list columns containing byte columns of the JCUDF row data + */ +template +std::vector> +convert_to_rows(table_view const &tbl, batch_data &batch_info, offsetFunctor offset_functor, + std::vector const &column_starts, + std::vector const &column_sizes, + size_type const fixed_width_size_per_row, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { int device_id; CUDA_TRY(cudaGetDevice(&device_id)); int total_shmem_in_bytes; @@ -1537,23 +1622,12 @@ std::vector> convert_to_rows(table_view const &tbl, sizeof(cuda::barrier) * NUM_TILES_PER_KERNEL_LOADED; auto const shmem_limit_per_tile = total_shmem_in_bytes / NUM_TILES_PER_KERNEL_LOADED; - // break up the work into tiles, which are a starting and ending row/col #. - // this tile size is calculated based on the shared memory size available - // we want a single tile to fill up the entire shared memory space available - // for the transpose-like conversion. - - // There are two different processes going on here. The GPU conversion of the data - // and the writing of the data into the list of byte columns that are a maximum of - // 2 gigs each due to offset maximum size. The GPU conversion portion has to understand - // this limitation because the column must own the data inside and as a result it must be - // a distinct allocation for that column. Copying the data into these final buffers would - // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. - // The tiles are broken at the boundaries of specific rows based on the row sizes up - // to that point. These are row batches and they are decided first before building the - // tiles so the tiles can be properly cut around them. 
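+ // For reference: the "2 gigs" cap described above comes from cudf's 32-bit list
+ // offsets, so each returned byte column must stay below 2^31 bytes and the row
+ // batches are cut to respect that bound.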
+ auto const num_rows = tbl.num_rows(); + auto const num_columns = tbl.num_columns(); + auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); + auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); // Get the pointers to the input columnar data ready - auto data_begin = thrust::make_transform_iterator( tbl.begin(), [](auto const &c) { return c.template data(); }); std::vector input_data(data_begin, data_begin + tbl.num_columns()); @@ -1565,27 +1639,6 @@ std::vector> convert_to_rows(table_view const &tbl, auto dev_input_data = make_device_uvector_async(input_data, stream, mr); auto dev_input_nm = make_device_uvector_async(input_nm, stream, mr); - std::vector column_sizes; // byte size of each column - std::vector column_starts; // offset of column inside a row including alignment - column_sizes.reserve(num_columns); - column_starts.reserve(num_columns + 1); // we add a final offset for validity data start - - auto schema_column_iter = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), - [&tbl](auto i) -> std::tuple { - return {tbl.column(i).type(), tbl.column(i)}; - }); - - auto const fixed_width_size_per_row = detail::compute_column_information( - schema_column_iter, schema_column_iter + num_columns, column_starts, column_sizes); - - auto dev_col_sizes = make_device_uvector_async(column_sizes, stream, mr); - auto dev_col_starts = make_device_uvector_async(column_starts, stream, mr); - - // total encoded row size. This includes fixed-width data, validity, and variable-width data. - auto row_size_iter = thrust::make_constant_iterator(fixed_width_size_per_row); - auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); - // the first batch always exists unless we were sent an empty table auto const first_batch_size = batch_info.row_batches[0].row_count; @@ -1636,8 +1689,6 @@ std::vector> convert_to_rows(table_view const &tbl, util::div_rounding_up_unsafe(validity_tile_infos.size(), NUM_VALIDITY_TILES_PER_KERNEL)); dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); - detail::row_offset_functor offset_functor(fixed_width_size_per_row); - detail::copy_to_rows<<>>( num_rows, num_columns, shmem_limit_per_tile, gpu_tile_infos, dev_input_data.data(), dev_col_sizes.data(), dev_col_starts.data(), offset_functor, @@ -1670,6 +1721,76 @@ std::vector> convert_to_rows(table_view const &tbl, }); return ret; +} +#endif // #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +} // namespace detail + +std::vector> convert_to_rows(table_view const &tbl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + auto const num_columns = tbl.num_columns(); + auto const num_rows = tbl.num_rows(); + + auto const fixed_width_only = std::all_of( + tbl.begin(), tbl.end(), [](column_view const &c) { return is_fixed_width(c.type()); }); + + // break up the work into tiles, which are a starting and ending row/col #. + // this tile size is calculated based on the shared memory size available + // we want a single tile to fill up the entire shared memory space available + // for the transpose-like conversion. + + // There are two different processes going on here. The GPU conversion of the data + // and the writing of the data into the list of byte columns that are a maximum of + // 2 gigs each due to offset maximum size. 
The GPU conversion portion has to understand + // this limitation because the column must own the data inside and as a result it must be + // a distinct allocation for that column. Copying the data into these final buffers would + // be prohibitively expensive, so care is taken to ensure the GPU writes to the proper buffer. + // The tiles are broken at the boundaries of specific rows based on the row sizes up + // to that point. These are row batches and they are decided first before building the + // tiles so the tiles can be properly cut around them. + + std::vector column_sizes; // byte size of each column + std::vector column_starts; // offset of column inside a row including alignment + column_sizes.reserve(num_columns); + column_starts.reserve(num_columns + 1); // we add a final offset for validity data start + + auto schema_column_iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + [&tbl](auto i) -> std::tuple { + return {tbl.column(i).type(), tbl.column(i)}; + }); + + auto const fixed_width_size_per_row = detail::compute_column_information( + schema_column_iter, schema_column_iter + num_columns, column_starts, column_sizes); + if (fixed_width_only) { + // total encoded row size. This includes fixed-width data and validity only. It does not include + // variable-width data since it isn't copied with the fixed-width and validity kernel. + auto row_size_iter = thrust::make_constant_iterator( + util::round_up_unsafe(fixed_width_size_per_row, JCUDF_ROW_ALIGNMENT)); + + auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); + + detail::fixed_width_row_offset_functor offset_functor( + util::round_up_unsafe(fixed_width_size_per_row, JCUDF_ROW_ALIGNMENT)); + + return detail::convert_to_rows(tbl, batch_info, offset_functor, column_starts, column_sizes, + fixed_width_size_per_row, stream, mr); + } else { + auto row_sizes = detail::build_string_row_sizes(tbl, fixed_width_size_per_row, stream); + + auto row_size_iter = cudf::detail::make_counting_transform_iterator( + 0, detail::row_size_functor(num_rows, row_sizes.data(), 0)); + + auto batch_info = detail::build_batches(num_rows, row_size_iter, fixed_width_only, stream, mr); + + detail::string_row_offset_functor offset_functor(batch_info.batch_row_offsets); + + return detail::convert_to_rows(tbl, batch_info, offset_functor, column_starts, column_sizes, + fixed_width_size_per_row, stream, mr); + } + #else CUDF_FAIL("Column to row conversion optimization requires volta or later hardware."); return {}; @@ -1768,8 +1889,9 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, auto iter = thrust::make_transform_iterator(thrust::make_counting_iterator(0), [&schema](auto i) { return std::make_tuple(schema[i], nullptr); }); - auto const fixed_width_size_per_row = - detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes); + auto const fixed_width_size_per_row = util::round_up_unsafe( + detail::compute_column_information(iter, iter + num_columns, column_starts, column_sizes), + JCUDF_ROW_ALIGNMENT); // Ideally we would check that the offsets are all the same, etc. but for now // this is probably fine @@ -1842,7 +1964,7 @@ std::unique_ptr
convert_from_rows(lists_column_view const &input, dim3 validity_threads(std::min(validity_tile_infos.size() * 32, 128lu)); - detail::row_offset_functor offset_functor(fixed_width_size_per_row); + detail::fixed_width_row_offset_functor offset_functor(fixed_width_size_per_row); detail::copy_from_rows<<>>( num_rows, num_columns, shmem_limit_per_tile, offset_functor, gpu_batch_row_boundaries.data(), diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index f9c8029ed84..b759c746735 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3099,6 +3099,28 @@ void testScanDenseRank() { } } + @Test + void testScanPercentRank() { + try (ColumnVector col1 = ColumnVector.fromBoxedInts(-97, -97, -97, null, -16, 5, null, null, 6, 6, 34, null); + ColumnVector col2 = ColumnVector.fromBoxedInts( 3, 3, 4, 7, 7, 7, 7, 7, 8, 8, 8, 9); + ColumnVector struct_order = ColumnVector.makeStruct(col1, col2); + ColumnVector expected = ColumnVector.fromBoxedDoubles( + 0.0, 0.0, 2.0/11, 3.0/11, 4.0/11, 5.0/11, 6.0/11, 6.0/11, 8.0/11, 8.0/11, 10.0/11, 1.0)) { + try (ColumnVector result = struct_order.scan(ScanAggregation.percentRank(), + ScanType.INCLUSIVE, NullPolicy.INCLUDE)) { + assertColumnsAreEqual(expected, result); + } + + // Exclude should have identical results + try (ColumnVector result = struct_order.scan(ScanAggregation.percentRank(), + ScanType.INCLUSIVE, NullPolicy.EXCLUDE)) { + assertColumnsAreEqual(expected, result); + } + + // Percent rank aggregations do not support ScanType.EXCLUSIVE + } + } + @Test void testWindowStatic() { try (Scalar one = Scalar.fromInt(1); @@ -4342,10 +4364,10 @@ void testExtractListElements() { ColumnVector expected = ColumnVector.fromStrings("Héllo", "thésé", null, - null, + "", "ARé", "test"); - ColumnVector tmp = v.stringSplitRecord(); + ColumnVector tmp = v.stringSplitRecord(" "); ColumnVector result = tmp.extractListElement(0)) { assertColumnsAreEqual(expected, result); } @@ -4739,28 +4761,12 @@ void testListSortRowsWithStringChild() { } } - @Test - void testStringSplitRecord() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo there", "thésé", "null", "", "ARé some", "test strings"); - ColumnVector expected = ColumnVector.fromLists( - new HostColumnVector.ListType(true, - new HostColumnVector.BasicType(true, DType.STRING)), - Arrays.asList("Héllo", "there"), - Arrays.asList("thésé"), - Arrays.asList("null"), - Arrays.asList(""), - Arrays.asList("ARé", "some"), - Arrays.asList("test", "strings")); - Scalar pattern = Scalar.fromString(" "); - ColumnVector result = v.stringSplitRecord(pattern, -1)) { - assertColumnsAreEqual(expected, result); - } - } - @Test void testStringSplit() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", "ARé some things", "test strings here"); - Table expectedSplitOnce = new Table.TestBuilder() + String pattern = " "; + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", + "ARé some things", "test strings here"); + Table expectedSplitLimit2 = new Table.TestBuilder() .column("Héllo", "thésé", null, "", "ARé", "test") .column("there all", null, null, null, "some things", "strings here") .build(); @@ -4769,41 +4775,92 @@ void testStringSplit() { .column("there", null, null, null, "some", "strings") .column("all", null, null, null, "things", "here") .build(); - Scalar pattern = Scalar.fromString(" "); - Table resultSplitOnce = 
v.stringSplit(pattern, 1); + Table resultSplitLimit2 = v.stringSplit(pattern, 2); Table resultSplitAll = v.stringSplit(pattern)) { - assertTablesAreEqual(expectedSplitOnce, resultSplitOnce); + assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2); assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } @Test - void teststringSplitWhiteSpace() { - try (ColumnVector v = ColumnVector.fromStrings("Héllo thesé", null, "are\tsome", "tést\nString", " "); - Table expected = new Table.TestBuilder().column("Héllo", null, "are", "tést", null) - .column("thesé", null, "some", "String", null) - .build(); - Table result = v.stringSplit()) { - assertTablesAreEqual(expected, result); + void testStringSplitByRegularExpression() { + String pattern = "[_ ]"; + try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "", + "ARé some_things", "test_strings_here"); + Table expectedSplitLimit2 = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there all", null, null, null, "some_things", "strings_here") + .build(); + Table expectedSplitAll = new Table.TestBuilder() + .column("Héllo", "thésé", null, "", "ARé", "test") + .column("there", null, null, null, "some", "strings") + .column("all", null, null, null, "things", "here") + .build(); + Table resultSplitLimit2 = v.stringSplit(pattern, 2, true); + Table resultSplitAll = v.stringSplit(pattern, true)) { + assertTablesAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertTablesAreEqual(expectedSplitAll, resultSplitAll); } } @Test - void teststringSplitThrowsException() { - assertThrows(CudfException.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Scalar delimiter = Scalar.fromString(null); - Table result = cv.stringSplit(delimiter)) {} - }); - assertThrows(AssertionError.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Scalar delimiter = Scalar.fromInt(1); - Table result = cv.stringSplit(delimiter)) {} - }); - assertThrows(AssertionError.class, () -> { - try (ColumnVector cv = ColumnVector.fromStrings("Héllo", "thésé", null, "", "ARé", "strings"); - Table result = cv.stringSplit(null)) {} - }); + void testStringSplitRecord() { + String pattern = " "; + try (ColumnVector v = ColumnVector.fromStrings("Héllo there all", "thésé", null, "", + "ARé some things", "test strings here"); + ColumnVector expectedSplitLimit2 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some things"), + Arrays.asList("test", "strings here")); + ColumnVector expectedSplitAll = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there", "all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some", "things"), + Arrays.asList("test", "strings", "here")); + ColumnVector resultSplitLimit2 = v.stringSplitRecord(pattern, 2); + ColumnVector resultSplitAll = v.stringSplitRecord(pattern)) { + assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertColumnsAreEqual(expectedSplitAll, resultSplitAll); + } + } + + @Test + void testStringSplitRecordByRegularExpression() { + String pattern = "[_ ]"; + try (ColumnVector v = ColumnVector.fromStrings("Héllo_there all", "thésé", null, "", 
+ "ARé some_things", "test_strings_here"); + ColumnVector expectedSplitLimit2 = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some_things"), + Arrays.asList("test", "strings_here")); + ColumnVector expectedSplitAll = ColumnVector.fromLists( + new HostColumnVector.ListType(true, + new HostColumnVector.BasicType(true, DType.STRING)), + Arrays.asList("Héllo", "there", "all"), + Arrays.asList("thésé"), + null, + Arrays.asList(""), + Arrays.asList("ARé", "some", "things"), + Arrays.asList("test", "strings", "here")); + ColumnVector resultSplitLimit2 = v.stringSplitRecord(pattern, 2, true); + ColumnVector resultSplitAll = v.stringSplitRecord(pattern, true)) { + assertColumnsAreEqual(expectedSplitLimit2, resultSplitLimit2); + assertColumnsAreEqual(expectedSplitAll, resultSplitAll); + } } @Test diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index db1327c5471..f309b1ee703 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -3930,8 +3930,8 @@ void testValidityFill() { @Test void testGroupByScan() { try (Table t1 = new Table.TestBuilder() - .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") - .column( 0, 1, 3, 3, 5, 5, 5, 5, 5, 5, 5) + .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") // GBY Key#0 + .column( 0, 1, 3, 3, 5, 5, 5, 5, 5, 5, 5) // GBY Key#1 .column(12.0, 14.0, 13.0, 17.0, 17.0, 17.0, null, null, 11.0, null, 10.0) .column( -9, null, -5, 0, 4, 4, 8, 2, 2, 2, null) .build()) { @@ -3945,16 +3945,18 @@ void testGroupByScan() { GroupByScanAggregation.min().onColumn(2), GroupByScanAggregation.max().onColumn(2), GroupByScanAggregation.rank().onColumn(3), - GroupByScanAggregation.denseRank().onColumn(3)); + GroupByScanAggregation.denseRank().onColumn(3), + GroupByScanAggregation.percentRank().onColumn(3)); Table expected = new Table.TestBuilder() - .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") + .column( "1", "1", "1", "1", "1", "1", "1", "2", "2", "2", "2") .column( 0, 1, 3, 3, 5, 5, 5, 5, 5, 5, 5) .column(12.0, 14.0, 13.0, 30.0, 17.0, 34.0, null, null, 11.0, null, 21.0) .column( 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 3) // odd why is this not 1 based? .column(12.0, 14.0, 13.0, 13.0, 17.0, 17.0, null, null, 11.0, null, 10.0) .column(12.0, 14.0, 13.0, 17.0, 17.0, 17.0, null, null, 11.0, null, 11.0) - .column(1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 4) - .column(1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2) + .column( 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 4) + .column( 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2) + .column( 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0) .build()) { assertTablesAreEqual(expected, result); } diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 68f7101b6ee..35b806909ec 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from enum import Enum, IntEnum @@ -59,6 +59,7 @@ class AggregationKind(Enum): PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA CORRELATION = libcudf_aggregation.aggregation.Kind.CORRELATION + COVARIANCE = libcudf_aggregation.aggregation.Kind.COVARIANCE class CorrelationType(IntEnum): @@ -354,6 +355,7 @@ cdef class Aggregation: )) return agg + cdef class RollingAggregation: """A Cython wrapper for rolling window aggregations. @@ -742,6 +744,21 @@ cdef class GroupbyAggregation: )) return agg + @classmethod + def cov( + cls, + libcudf_types.size_type min_periods, + libcudf_types.size_type ddof=1 + ): + cdef GroupbyAggregation agg = cls() + + agg.c_obj = move( + libcudf_aggregation. + make_covariance_aggregation[groupby_aggregation]( + min_periods, ddof + )) + return agg + cdef class GroupbyScanAggregation: """A Cython wrapper for groupby scan aggregations. diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 3982b4fecbb..04eead4ee1b 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -40,6 +40,7 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' CORRELATION 'cudf::aggregation::CORRELATION' + COVARIANCE 'cudf::aggregation::COVARIANCE' Kind kind @@ -117,3 +118,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef unique_ptr[T] make_correlation_aggregation[T]( correlation_type type, size_type min_periods) except + + + cdef unique_ptr[T] make_covariance_aggregation[T]( + size_type min_periods, size_type ddof) except + diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/cpp/hash.pxd index fd9992152a6..41d10b7b6da 100644 --- a/python/cudf/cudf/_lib/cpp/hash.pxd +++ b/python/cudf/cudf/_lib/cpp/hash.pxd @@ -1,18 +1,25 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector -cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view cdef extern from "cudf/hashing.hpp" namespace "cudf" nogil: + + ctypedef enum hash_id "cudf::hash_id": + HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY" + HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3" + HASH_MD5 "cudf::hash_id::HASH_MD5" + HASH_SERIAL_MURMUR3 "cudf::hash_id::HASH_SERIAL_MURMUR3" + HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3" + cdef unique_ptr[column] hash "cudf::hash" ( const table_view& input, - const libcudf_types.hash_id hash_function, + const hash_id hash_function, const uint32_t seed ) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/contains.pxd b/python/cudf/cudf/_lib/cpp/strings/contains.pxd index 8014a60617d..59131f6e1aa 100644 --- a/python/cudf/cudf/_lib/cpp/strings/contains.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/contains.pxd @@ -1,20 +1,13 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
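The COVARIANCE plumbing above forwards two knobs, min_periods and ddof, to make_covariance_aggregation. As a rough, illustrative sketch of the per-group quantity this kind of aggregation computes — pairwise covariance over rows where both members of the pair are valid, divided by (n - ddof), and null when too few observations survive — here in plain Python rather than the libcudf kernel:

# Illustrative only: the guard and divisor mirror the min_periods/ddof
# parameters passed through make_covariance_aggregation above.
def group_cov(xs, ys, min_periods=1, ddof=1):
    pairs = [(x, y) for x, y in zip(xs, ys) if x is not None and y is not None]
    n = len(pairs)
    if n < min_periods or n <= ddof:
        return None  # not enough valid pairs
    mean_x = sum(x for x, _ in pairs) / n
    mean_y = sum(y for _, y in pairs) / n
    return sum((x - mean_x) * (y - mean_y) for x, y in pairs) / (n - ddof)

print(group_cov([1.0, 2.0, 3.0], [2.0, 4.0, 6.0]))  # 2.0 with ddof=1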
from libcpp.memory cimport unique_ptr from libcpp.string cimport string from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.strings.regex_flags cimport regex_flags -cdef extern from "cudf/strings/regex/flags.hpp" \ - namespace "cudf::strings" nogil: - - ctypedef enum regex_flags: - DEFAULT 'cudf::strings::regex_flags::DEFAULT' - MULTILINE 'cudf::strings::regex_flags::MULTILINE' - DOTALL 'cudf::strings::regex_flags::DOTALL' - cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] contains_re( diff --git a/python/cudf/cudf/_lib/cpp/strings/extract.pxd b/python/cudf/cudf/_lib/cpp/strings/extract.pxd index 606369c8994..43a43594997 100644 --- a/python/cudf/cudf/_lib/cpp/strings/extract.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/extract.pxd @@ -1,10 +1,11 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.strings.contains cimport regex_flags from cudf._lib.cpp.table.table cimport table @@ -12,4 +13,5 @@ cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[table] extract( column_view source_strings, - string pattern) except + + string pattern, + regex_flags flags) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/regex_flags.pxd b/python/cudf/cudf/_lib/cpp/strings/regex_flags.pxd new file mode 100644 index 00000000000..2a5701fa6a3 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/strings/regex_flags.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +cdef extern from "cudf/strings/regex/flags.hpp" \ + namespace "cudf::strings" nogil: + + ctypedef enum regex_flags: + DEFAULT 'cudf::strings::regex_flags::DEFAULT' + MULTILINE 'cudf::strings::regex_flags::MULTILINE' + DOTALL 'cudf::strings::regex_flags::DOTALL' diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index 23727a20ec2..b1a257feedf 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -81,13 +81,6 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: DECIMAL64 "cudf::type_id::DECIMAL64" DECIMAL128 "cudf::type_id::DECIMAL128" - ctypedef enum hash_id "cudf::hash_id": - HASH_IDENTITY "cudf::hash_id::HASH_IDENTITY" - HASH_MURMUR3 "cudf::hash_id::HASH_MURMUR3" - HASH_MD5 "cudf::hash_id::HASH_MD5" - HASH_SERIAL_MURMUR3 "cudf::hash_id::HASH_SERIAL_MURMUR3" - HASH_SPARK_MURMUR3 "cudf::hash_id::HASH_SPARK_MURMUR3" - cdef cppclass data_type: data_type() except + data_type(const data_type&) except + diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 314542c9549..49a924c9104 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
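One detail worth noting about the regex_flags enum that now lives in its own regex_flags.pxd: the values appear chosen to line up with Python's re module (MULTILINE is 8 and DOTALL is 16 in CPython), which is what lets the bindings forward a raw flags integer without a translation step. A quick sanity check of the Python-side values; that libcudf mirrors them is an assumption to verify against cudf/strings/regex/flags.hpp:

import re

# CPython's values; cudf::strings::regex_flags is expected to match them,
# allowing flags integers to pass straight through the Cython layer.
assert re.MULTILINE == 8
assert re.DOTALL == 16
print(re.MULTILINE | re.DOTALL)  # 24 -- flags combine as a plain bitmask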
from collections import defaultdict @@ -54,7 +54,7 @@ _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"} _STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT", "UNIQUE"} _LIST_AGGS = {"COLLECT"} -_STRUCT_AGGS = {"CORRELATION"} +_STRUCT_AGGS = {"CORRELATION", "COVARIANCE"} _INTERVAL_AGGS = set() _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", "NTH", "COLLECT"} diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index adc48159aac..301f571f5fb 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint32_t from libcpp cimport bool @@ -10,7 +10,7 @@ from libcpp.vector cimport vector cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.hash cimport hash as cpp_hash +from cudf._lib.cpp.hash cimport hash as cpp_hash, hash_id as cpp_hash_id from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -58,11 +58,11 @@ def hash(source_table, str method, int seed=0): cdef table_view c_source_view = table_view_from_table( source_table, ignore_index=True) cdef unique_ptr[column] c_result - cdef libcudf_types.hash_id c_hash_function + cdef cpp_hash_id c_hash_function if method == "murmur3": - c_hash_function = libcudf_types.hash_id.HASH_MURMUR3 + c_hash_function = cpp_hash_id.HASH_MURMUR3 elif method == "md5": - c_hash_function = libcudf_types.hash_id.HASH_MD5 + c_hash_function = cpp_hash_id.HASH_MD5 else: raise ValueError(f"Unsupported hash function: {method}") with nogil: diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index f18d0eb7f36..31f0fb2ec04 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr @@ -12,8 +12,8 @@ from cudf._lib.cpp.strings.contains cimport ( contains_re as cpp_contains_re, count_re as cpp_count_re, matches_re as cpp_matches_re, - regex_flags as regex_flags, ) +from cudf._lib.cpp.strings.regex_flags cimport regex_flags from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 9d554aa2162..bac282dccc5 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -1,5 +1,6 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
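With hash_id now declared alongside cudf::hash in hash.pxd, the binding above dispatches on the method string. A hedged usage sketch of the public entry point; hash_values is assumed to be the user-facing wrapper over this binding in this cudf version, and the exact hash outputs are illustrative:

import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
# "murmur3" maps to HASH_MURMUR3, "md5" to HASH_MD5; anything else raises.
print(df.hash_values(method="murmur3"))  # one unsigned hash per row
print(df.hash_values(method="md5"))      # one hex-digest string per row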
+from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -8,12 +9,13 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.strings.extract cimport extract as cpp_extract +from cudf._lib.cpp.strings.regex_flags cimport regex_flags from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport data_from_unique_ptr -def extract(Column source_strings, object pattern): +def extract(Column source_strings, object pattern, uint32_t flags): """ Returns data which contains extracted capture groups provided in `pattern` for all `source_strings`. @@ -24,11 +26,13 @@ def extract(Column source_strings, object pattern): cdef column_view source_view = source_strings.view() cdef string pattern_string = str(pattern).encode() + cdef regex_flags c_flags = flags with nogil: c_result = move(cpp_extract( source_view, - pattern_string + pattern_string, + c_flags )) return data_from_unique_ptr( diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index c4e4b6c38d8..2d62a3f9a4b 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -9,11 +9,11 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport string_scalar -from cudf._lib.cpp.strings.contains cimport regex_flags from cudf._lib.cpp.strings.findall cimport ( findall as cpp_findall, findall_record as cpp_findall_record, ) +from cudf._lib.cpp.strings.regex_flags cimport regex_flags from cudf._lib.cpp.table.table cimport table from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport data_from_unique_ptr diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 7fa745719c9..89e6dbf0826 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1,8 +1,9 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. from __future__ import annotations import pickle +from functools import cached_property from typing import Any, Set import pandas as pd @@ -31,7 +32,6 @@ is_mixed_with_object_dtype, numeric_normalize_types, ) -from cudf.utils.utils import cached_property class BaseIndex(Serializable): @@ -1201,9 +1201,9 @@ def rename(self, name, inplace=False): self.name = name return None else: - out = self.copy(deep=False) + out = self.copy(deep=True) out.name = name - return out.copy(deep=True) + return out def astype(self, dtype, copy=False): """ diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 24f9dc83ca9..f57d88fb39d 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
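The extract binding above now accepts a flags value, cast directly to the regex_flags enum. At the user level this surfaces through Series.str.extract (see the string.py changes further below); a small sketch, assuming MULTILINE semantics match Python's re:

import re
import cudf

s = cudf.Series(["a1\nb2", "c3"])
# With re.MULTILINE, '^' anchors at each line start rather than only at
# the start of the string; extract returns one column per capture group.
print(s.str.extract(r"^([a-z])(\d)", flags=re.MULTILINE))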
from __future__ import annotations @@ -1048,7 +1048,9 @@ def find_and_replace( f"got to_replace dtype: {to_replace_col.dtype} and " f"value dtype: {replacement_col.dtype}" ) - df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col}) + df = cudf.DataFrame._from_data( + {"old": to_replace_col, "new": replacement_col} + ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: fill_value = df._data["new"][df._data["old"].isnull()][0] @@ -1058,8 +1060,7 @@ def find_and_replace( new_categories = self.categories.append( column.as_column([fill_value]) ) - replaced = self.copy() - replaced = replaced._set_categories(new_categories) + replaced = self._set_categories(new_categories) replaced = replaced.fillna(fill_value) df = df.dropna(subset=["old"]) to_replace_col = df._data["old"] @@ -1078,35 +1079,41 @@ def find_and_replace( replacement_col = df._data["new"] # create a dataframe containing the pre-replacement categories - # and a copy of them to work with. The index of this dataframe - # represents the original ints that map to the categories - old_cats = cudf.DataFrame() - old_cats["cats"] = column.as_column(replaced.dtype.categories) - new_cats = old_cats.copy(deep=True) - - # Create a column with the appropriate labels replaced - old_cats["cats_replace"] = old_cats["cats"].replace( - to_replace_col, replacement_col + # and a column with the appropriate labels replaced. + # The index of this dataframe represents the original + # ints that map to the categories + cats_col = column.as_column(replaced.dtype.categories) + old_cats = cudf.DataFrame._from_data( + { + "cats": cats_col, + "cats_replace": cats_col.find_and_replace( + to_replace_col, replacement_col + ), + } ) # Construct the new categorical labels # If a category is being replaced by an existing one, we # want to map it to None. 
If it's totally new, we want to # map it to the new label it is to be replaced by - dtype_replace = cudf.Series(replacement_col) - dtype_replace[dtype_replace.isin(old_cats["cats"])] = None - new_cats["cats"] = new_cats["cats"].replace( - to_replace_col, dtype_replace + dtype_replace = cudf.Series._from_data({None: replacement_col}) + dtype_replace[dtype_replace.isin(cats_col)] = None + new_cats_col = cats_col.find_and_replace( + to_replace_col, dtype_replace._column ) # anything we mapped to None, we want to now filter out since # those categories don't exist anymore # Resetting the index creates a column 'index' that associates # the original integers to the new labels - bmask = new_cats._data["cats"].notnull() - new_cats = cudf.DataFrame( - {"cats": new_cats._data["cats"].apply_boolean_mask(bmask)} - ).reset_index() + bmask = new_cats_col.notnull() + new_cats_col = new_cats_col.apply_boolean_mask(bmask) + new_cats = cudf.DataFrame._from_data( + { + "index": cudf.core.column.arange(len(new_cats_col)), + "cats": new_cats_col, + } + ) # old_cats contains replaced categories and the ints that # previously mapped to those categories and the index of @@ -1507,9 +1514,15 @@ def _set_categories( old_codes = column.arange(len(cur_cats), dtype=out_code_dtype) new_codes = column.arange(len(new_cats), dtype=out_code_dtype) - new_df = cudf.DataFrame({"new_codes": new_codes, "cats": new_cats}) - old_df = cudf.DataFrame({"old_codes": old_codes, "cats": cur_cats}) - cur_df = cudf.DataFrame({"old_codes": cur_codes, "order": cur_order}) + new_df = cudf.DataFrame._from_data( + data={"new_codes": new_codes, "cats": new_cats} + ) + old_df = cudf.DataFrame._from_data( + data={"old_codes": old_codes, "cats": cur_cats} + ) + cur_df = cudf.DataFrame._from_data( + data={"old_codes": cur_codes, "order": cur_order} + ) # Join the old and new categories and line up their codes df = old_df.merge(new_df, on="cats", how="left") @@ -1519,7 +1532,7 @@ def _set_categories( df.reset_index(drop=True, inplace=True) ordered = ordered if ordered is not None else self.ordered - new_codes = df["new_codes"]._column + new_codes = df._data["new_codes"] # codes can't have masks, so take mask out before moving in return column.build_categorical_column( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2c3951c0e5e..1c1c2ef2bf6 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -177,34 +177,28 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: return self.binary_operator("NULL_EQUALS", other).all() def all(self, skipna: bool = True) -> bool: + # The skipna argument is only used for numerical columns. # If all entries are null the result is True, including when the column # is empty. - result_col = self.nans_to_nulls() if skipna else self - if result_col.null_count == result_col.size: + if self.null_count == self.size: return True - if isinstance(result_col, ColumnBase): - return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) - - return result_col + return libcudf.reduce.reduce("all", self, dtype=np.bool_) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. 
- result_col = self.nans_to_nulls() if skipna else self - if not skipna and result_col.has_nulls(): + + if not skipna and self.has_nulls(): return True - elif skipna and result_col.null_count == result_col.size: + elif skipna and self.null_count == self.size: return False - if isinstance(result_col, ColumnBase): - return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) - - return result_col + return libcudf.reduce.reduce("any", self, dtype=np.bool_) def dropna(self, drop_nan: bool = False) -> ColumnBase: - col = self.nans_to_nulls() if drop_nan else self - return drop_nulls([col])[0] + # The drop_nan argument is only used for numerical columns. + return drop_nulls([self])[0] def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -341,6 +335,14 @@ def _fill( return self def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: + # libcudf currently doesn't handle the case when abs(offset) > len(df); + # the issue tracking the fix is linked below: + # https://github.com/rapidsai/cudf/issues/10314 + if abs(offset) > len(self): + if fill_value is None: + return column_empty_like(self, masked=True) + else: + return full(len(self), fill_value, dtype=self.dtype) return libcudf.copying.shift(self, offset, fill_value) @property @@ -1164,8 +1166,9 @@ def corr(self, other: ColumnBase): f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - def nans_to_nulls(self: T) -> T: - return self + @property + def contains_na_entries(self) -> bool: + return self.null_count != 0 def _process_for_reduction( self, skipna: bool = None, min_count: int = 0 @@ -1173,14 +1176,13 @@ def _process_for_reduction( skipna = True if skipna is None else skipna if skipna: - result_col = self.nans_to_nulls() - if result_col.has_nulls(): - result_col = result_col.dropna() + if self.has_nulls(): + result_col = self.dropna() else: if self.has_nulls(): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - result_col = self + result_col = self if min_count > 0: valid_count = len(result_col) - result_col.null_count @@ -1608,8 +1610,8 @@ def build_struct_column( Parameters ---------- - names : list-like - Field names to map to children dtypes + names : sequence of strings + Field names to map to children dtypes; must be strings. children : tuple mask: Buffer @@ -2093,24 +2095,16 @@ def as_column( dtype = "bool" np_type = np.dtype(dtype).type pa_type = np_to_pa_dtype(np.dtype(dtype)) - # TODO: A warning is emitted from pyarrow 5.0.0's function - # pyarrow.lib._sequence_to_array: - # "DeprecationWarning: an integer is required (got type float). - # Implicit conversion to integers using __int__ is deprecated, - # and may be removed in a future version of Python." - # This warning does not appear in pyarrow 6.0.1 and will be - # resolved by https://github.com/rapidsai/cudf/pull/9686/. 
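The shift early-exit added above makes out-of-range offsets well defined: when abs(offset) exceeds the column length, every row of the result is either null or the fill value, so libcudf never sees the problematic case. A sketch of the user-visible behavior:

import cudf

s = cudf.Series([1, 2, 3])
print(s.shift(10))                # all rows null: offset exceeds the length
print(s.shift(10, fill_value=0))  # all rows 0 instead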
- with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) - pa_array = pa.array( + data = as_column( + pa.array( arbitrary, type=pa_type, from_pandas=True if nan_as_null is None else nan_as_null, - ) - data = as_column( - pa_array, dtype=dtype, nan_as_null=nan_as_null, + ), + dtype=dtype, + nan_as_null=nan_as_null, ) except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): if is_categorical_dtype(dtype): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 9b54c4d9acd..2c483cbd00b 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -20,6 +20,7 @@ import cudf from cudf import _lib as libcudf +from cudf._lib.stream_compaction import drop_nulls from cudf._typing import BinaryOperand, ColumnLike, Dtype, DtypeObj, ScalarLike from cudf.api.types import is_integer_dtype, is_number from cudf.core.buffer import Buffer @@ -316,6 +317,27 @@ def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: return self return libcudf.unary.cast(self, dtype) + def all(self, skipna: bool = True) -> bool: + # If all entries are null the result is True, including when the column + # is empty. + result_col = self.nans_to_nulls() if skipna else self + + if result_col.null_count == result_col.size: + return True + + return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + + def any(self, skipna: bool = True) -> bool: + # Early exit for fast cases. + result_col = self.nans_to_nulls() if skipna else self + + if not skipna and result_col.has_nulls(): + return True + elif skipna and result_col.null_count == result_col.size: + return False + + return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + @property def nan_count(self) -> int: if self.dtype.kind != "f": @@ -325,6 +347,14 @@ def nan_count(self) -> int: self._nan_count = nan_col.sum() return self._nan_count + def dropna(self, drop_nan: bool = False) -> NumericalColumn: + col = self.nans_to_nulls() if drop_nan else self + return drop_nulls([col])[0] + + @property + def contains_na_entries(self) -> bool: + return (self.nan_count != 0) or (self.null_count != 0) + def _process_values_for_isin( self, values: Sequence ) -> Tuple[ColumnBase, ColumnBase]: @@ -346,12 +376,14 @@ def _can_return_nan(self, skipna: bool = None) -> bool: def _process_for_reduction( self, skipna: bool = None, min_count: int = 0 - ) -> Union[ColumnBase, ScalarLike]: + ) -> Union[NumericalColumn, ScalarLike]: skipna = True if skipna is None else skipna if self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - return super()._process_for_reduction( + + col = self.nans_to_nulls() if skipna else self + return super(NumericalColumn, col)._process_for_reduction( skipna=skipna, min_count=min_count ) @@ -399,7 +431,6 @@ def find_and_replace( replacement_col = _normalize_find_and_replace_input( self.dtype, replacement ) - replaced = self.copy() if len(replacement_col) == 1 and len(to_replace_col) > 1: replacement_col = column.as_column( utils.scalar_broadcast_to( @@ -407,11 +438,13 @@ def find_and_replace( ) ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: - return replaced + return self.copy() to_replace_col, replacement_col, replaced = numeric_normalize_types( - to_replace_col, replacement_col, replaced + to_replace_col, replacement_col, self + ) + df = cudf.DataFrame._from_data( + {"old": to_replace_col, "new": replacement_col} ) - df = cudf.DataFrame({"old": to_replace_col, "new": 
replacement_col}) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: replaced = replaced.fillna( @@ -420,7 +453,7 @@ def find_and_replace( df = df.dropna(subset=["old"]) return libcudf.replace.replace( - replaced, df["old"]._column, df["new"]._column + replaced, df._data["old"], df._data["new"] ) def fillna( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ee1ddb58abc..5e40356ed0f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -574,6 +574,8 @@ def extract( ---------- pat : str Regular expression pattern with capturing groups. + flags : int, default 0 (no flags) + Flags to pass through to the regex engine (e.g. re.MULTILINE) expand : bool, default True If True, return DataFrame with one column per capture group. If False, return a Series/Index if there is one capture group or @@ -588,8 +590,8 @@ def extract( Notes ----- - The `flags` parameter is not yet supported and will raise a - NotImplementedError if anything other than the default value is passed. + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. Examples -------- @@ -618,10 +620,12 @@ def extract( 2 dtype: object """ # noqa W605 - if flags != 0: - raise NotImplementedError("`flags` parameter is not yet supported") + if not _is_supported_regex_flags(flags): + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) - data, index = libstrings.extract(self._column, pat) + data, index = libstrings.extract(self._column, pat, flags) if len(data) == 1 and expand is False: data = next(iter(data.values())) else: @@ -752,7 +756,9 @@ def contains( flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise ValueError("invalid `flags` parameter value") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) if pat is None: result_col = column.column_empty( @@ -3393,7 +3399,9 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise ValueError("invalid `flags` parameter value") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) return self._return_or_inplace( libstrings.count_re(self._column, pat, flags) @@ -3969,7 +3977,9 @@ def match( flags = pat.flags & ~re.U pat = pat.pattern if not _is_supported_regex_flags(flags): - raise ValueError("invalid `flags` parameter value") + raise NotImplementedError( + "unsupported value for `flags` parameter" + ) return self._return_or_inplace( libstrings.match_re(self._column, pat, flags) @@ -5329,7 +5339,9 @@ def find_and_replace( and replacement_col.dtype != self.dtype ): return self.copy() - df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col}) + df = cudf.DataFrame._from_data( + {"old": to_replace_col, "new": replacement_col} + ) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: res = self.fillna(df._data["new"][df._data["old"].isnull()][0]) @@ -5401,7 +5413,7 @@ def binary_operator( return cast( "column.ColumnBase", libstrings.concatenate( - cudf.DataFrame({0: lhs, 1: rhs}), + cudf.DataFrame._from_data(data={0: lhs, 1: rhs}), sep=cudf.Scalar(""), na_rep=cudf.Scalar(None, "str"), ), diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 67976ac27d4..9cb86ca1cd2 100644 --- 
a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -4,7 +4,7 @@ import itertools from collections.abc import MutableMapping -from functools import reduce +from functools import cached_property, reduce from typing import ( TYPE_CHECKING, Any, @@ -20,7 +20,6 @@ import cudf from cudf.core import column -from cudf.utils.utils import cached_property if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -360,9 +359,9 @@ def select_by_index(self, index: Any) -> ColumnAccessor: start, stop, step = index.indices(len(self._data)) keys = self.names[start:stop:step] elif pd.api.types.is_integer(index): - keys = [self.names[index]] + keys = (self.names[index],) else: - keys = (self.names[i] for i in index) + keys = tuple(self.names[i] for i in index) data = {k: self._data[k] for k in keys} return self.__class__( data, multiindex=self.multiindex, level_names=self.level_names, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 99b5de757a6..b31f9f2c30e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -113,6 +113,7 @@ def __setitem__(self, key, value): key = (key, slice(None)) return self._setitem_tuple_arg(key, value) + @annotate("_CAN_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") def _can_downcast_to_series(self, df, arg): """ This method encapsulates the logic used @@ -153,6 +154,7 @@ def _can_downcast_to_series(self, df, arg): return True return False + @annotate("_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") def _downcast_to_series(self, df, arg): """ "Downcast" from a DataFrame to a Series @@ -194,6 +196,7 @@ class _DataFrameLocIndexer(_DataFrameIndexer): For selection by label. """ + @annotate("_GETITEM_SCALAR", color="green", domain="cudf_python") def _getitem_scalar(self, arg): return self._frame[arg[1]].loc[arg[0]] @@ -635,6 +638,9 @@ def __init__( if dtype: self._data = self.astype(dtype)._data + @annotate( + "DATAFRAME_INIT_FROM_SERIES_LIST", color="blue", domain="cudf_python" + ) def _init_from_series_list(self, data, columns, index): if index is None: # When `index` is `None`, the final index of @@ -733,6 +739,9 @@ def _init_from_series_list(self, data, columns, index): ) self._data = self._data.select_by_label(columns) + @annotate( + "DATAFRAME_INIT_FROM_LIST_LIKE", color="blue", domain="cudf_python" + ) def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) @@ -769,6 +778,9 @@ def _init_from_list_like(self, data, index=None, columns=None): self.columns = columns + @annotate( + "DATAFRAME_INIT_FROM_DICT_LIKE", color="blue", domain="cudf_python" + ) def _init_from_dict_like( self, data, index=None, columns=None, nan_as_null=None ): @@ -820,8 +832,8 @@ def _init_from_dict_like( self._data.multiindex = self._data.multiindex and isinstance( col_name, tuple ) - self.insert( - i, col_name, data[col_name], nan_as_null=nan_as_null + self._insert( + i, col_name, data[col_name], nan_as_null=nan_as_null, ) if columns is not None: @@ -842,6 +854,11 @@ def _from_data( return out @staticmethod + @annotate( + "DATAFRAME_ALIGN_INPUT_SERIES_INDICES", + color="blue", + domain="cudf_python", + ) def _align_input_series_indices(data, index): data = data.copy() @@ -1077,7 +1094,7 @@ def __setitem__(self, arg, value): ) else: for col_name in self._data: - scatter_map = arg[col_name] + scatter_map = arg._data[col_name] if is_scalar(value): self._data[col_name][scatter_map] = value else: @@ 
-1187,6 +1204,7 @@ def __delitem__(self, name): """ self._drop_column(name) + @annotate("DATAFRAME_SLICE", color="blue", domain="cudf_python") def _slice(self: T, arg: slice) -> T: """ _slice : slice the frame as per the arg @@ -1248,18 +1266,13 @@ def _slice(self: T, arg: slice) -> T: result.columns = self.columns return result + @annotate("DATAFRAME_MEMORY_USAGE", color="blue", domain="cudf_python") def memory_usage(self, index=True, deep=False): return Series( {str(k): v for k, v in super().memory_usage(index, deep).items()} ) - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if method == "__call__" and hasattr(cudf, ufunc.__name__): - func = getattr(cudf, ufunc.__name__) - return func(self) - else: - return NotImplemented - + @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python") def __array_function__(self, func, types, args, kwargs): cudf_df_module = DataFrame @@ -1298,6 +1311,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented # The _get_numeric_data method is necessary for dask compatibility. + @annotate("DATAFRAME_GET_NUMERIC_DATA", color="blue", domain="cudf_python") def _get_numeric_data(self): """Return a dataframe with only numeric data types""" columns = [ @@ -1307,6 +1321,7 @@ def _get_numeric_data(self): ] return self[columns] + @annotate("DATAFRAME_ASSIGN", color="blue", domain="cudf_python") def assign(self, **kwargs): """ Assign columns to DataFrame from keyword arguments. @@ -1322,10 +1337,16 @@ def assign(self, **kwargs): 1 1 4 2 2 5 """ - new = self.copy() + new_df = cudf.DataFrame(index=self.index.copy()) + for name, col in self._data.items(): + if name in kwargs: + new_df[name] = kwargs.pop(name) + else: + new_df._data[name] = col.copy() + for k, v in kwargs.items(): - new[k] = v - return new + new_df[k] = v + return new_df @classmethod @annotate("CONCAT", color="orange", domain="cudf_python") @@ -1785,10 +1806,12 @@ def _get_renderable_dataframe(self): return output + @annotate("DATAFRAME_REPR", color="blue", domain="cudf_python") def __repr__(self): output = self._get_renderable_dataframe() return self._clean_renderable_dataframe(output) + @annotate("DATAFRAME_REPR_HTML", color="blue", domain="cudf_python") def _repr_html_(self): lines = ( self._get_renderable_dataframe() @@ -1805,9 +1828,13 @@ def _repr_html_(self): lines.append("") return "\n".join(lines) + @annotate("DATAFRAME_REPR_LATEX", color="blue", domain="cudf_python") def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() + @annotate( + "DATAFRAME_GET_COLUMNS_BY_LABEL", color="blue", domain="cudf_python" + ) def _get_columns_by_label(self, labels, downcast=False): """ Return columns of dataframe by `labels` @@ -1830,7 +1857,7 @@ def _get_columns_by_label(self, labels, downcast=False): ) return out - def _binaryop( + def _prep_for_binop( self, other: Any, fn: str, @@ -1850,6 +1877,7 @@ def _binaryop( # implementation assumes that binary operations between a column and # NULL are always commutative, even for binops (like subtraction) that # are normally anticommutative. + # TODO: We probably should support pandas DataFrame/Series objects. if isinstance(rhs, Sequence): # TODO: Consider validating sequence length (pandas does). 
operands = { @@ -1913,13 +1941,33 @@ def _binaryop( right = right_dict[col] operands[col] = (left, right, reflect, fill_value) else: + return NotImplemented, None + + return operands, lhs._index + + @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") + def _binaryop( + self, + other: Any, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._prep_for_binop( + other, fn, fill_value, reflect, can_reindex + ) + if operands is NotImplemented: return NotImplemented return self._from_data( ColumnAccessor(type(self)._colwise_binop(operands, fn)), - index=lhs._index, + index=out_index, ) + @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python") def update( self, other, @@ -2013,14 +2061,27 @@ def update( self._mimic_inplace(source_df, inplace=True) + @annotate("DATAFRAME_ITER", color="blue", domain="cudf_python") def __iter__(self): return iter(self.columns) + @annotate("DATAFRAME_ITERITEMS", color="blue", domain="cudf_python") def iteritems(self): + """Iterate over column names and series pairs""" + warnings.warn( + "iteritems is deprecated and will be removed in a future version. " + "Use .items instead.", + FutureWarning, + ) + return self.items() + + @annotate("DATAFRAME_ITEMS", color="blue", domain="cudf_python") + def items(self): """Iterate over column names and series pairs""" for k in self: yield (k, self[k]) + @annotate("DATAFRAME_EQUALS", color="blue", domain="cudf_python") def equals(self, other, **kwargs): ret = super().equals(other) # If all other checks matched, validate names. @@ -2083,6 +2144,7 @@ def columns(self, columns): data, multiindex=is_multiindex, level_names=columns.names, ) + @annotate("DATAFRAME_REINDEX_INTERNAL", color="blue", domain="cudf_python") def _reindex( self, columns, dtypes=None, deep=False, index=None, inplace=False ): @@ -2159,6 +2221,7 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) + @annotate("DATAFRAME_REINDEX", color="blue", domain="cudf_python") def reindex( self, labels=None, axis=None, index=None, columns=None, copy=True ): @@ -2237,6 +2300,7 @@ def reindex( inplace=False, ) + @annotate("DATAFRAME_SET_INDEX", color="blue", domain="cudf_python") def set_index( self, keys, @@ -2279,7 +2343,7 @@ def set_index( 3 4 d 4.0 4 5 e 5.0 - Set the index to become the ‘b’ column: + Set the index to become the 'b' column: >>> df.set_index('b') a c @@ -2290,7 +2354,7 @@ def set_index( d 4 4.0 e 5 5.0 - Create a MultiIndex using columns ‘a’ and ‘b’: + Create a MultiIndex using columns 'a' and 'b': >>> df.set_index(["a", "b"]) c @@ -2497,7 +2561,7 @@ def reset_index( inplace=inplace, ) - @annotate("INSERT", color="green", domain="cudf_python") + @annotate("DATAFRAME_INSERT", color="green", domain="cudf_python") def insert(self, loc, name, value, nan_as_null=None): """Add a column to DataFrame at the index specified by loc. @@ -2508,6 +2572,29 @@ def insert(self, loc, name, value, nan_as_null=None): name : number or string name or label of column to be inserted value : Series or array-like + nan_as_null : bool, Default None + If ``None``/``True``, converts ``np.nan`` values to + ``null`` values. + If ``False``, leaves ``np.nan`` values as is. 
+ """ + return self._insert( + loc=loc, + name=name, + value=value, + nan_as_null=nan_as_null, + ignore_index=False, + ) + + @annotate("DATAFRAME__INSERT", color="green", domain="cudf_python") + def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): + """ + Same as `insert`, with additional `ignore_index` param. + + ignore_index : bool, default True + If True, there will be no index equality check & reindexing + happening. + If False, a reindexing operation is performed if + `value.index` is not equal to `self.index`. """ if name in self._data: raise NameError(f"duplicated column name {name}") @@ -2528,7 +2615,8 @@ def insert(self, loc, name, value, nan_as_null=None): if len(self) == 0: if isinstance(value, (pd.Series, Series)): - self._index = as_index(value.index) + if not ignore_index: + self._index = as_index(value.index) elif len(value) > 0: self._index = RangeIndex(start=0, stop=len(value)) new_data = self._data.__class__() @@ -2541,9 +2629,11 @@ def insert(self, loc, name, value, nan_as_null=None): ) self._data = new_data elif isinstance(value, (pd.Series, Series)): - value = Series(value, nan_as_null=nan_as_null)._align_to_index( - self._index, how="right", sort=False - ) + value = Series(value, nan_as_null=nan_as_null) + if not ignore_index: + value = value._align_to_index( + self._index, how="right", sort=False + ) value = column.as_column(value, nan_as_null=nan_as_null) @@ -2623,6 +2713,7 @@ def diff(self, periods=1, axis=0): return self - self.shift(periods=periods) + @annotate("DATAFRAME_DROP", color="green", domain="cudf_python") def drop( self, labels=None, @@ -2796,12 +2887,14 @@ def drop( if not inplace: return out + @annotate("DATAFRAME_DROP_COLUMN", color="green", domain="cudf_python") def _drop_column(self, name): """Drop a column by *name*""" if name not in self._data: raise KeyError(f"column '{name}' does not exist") del self._data[name] + @annotate("DATAFRAME_DROP_DUPLICATES", color="green", domain="cudf_python") def drop_duplicates( self, subset=None, keep="first", inplace=False, ignore_index=False ): @@ -2879,12 +2972,14 @@ def drop_duplicates( return self._mimic_inplace(outdf, inplace=inplace) + @annotate("DATAFRAME_POP", color="green", domain="cudf_python") def pop(self, item): """Return a column and drop it from the DataFrame.""" popped = self[item] del self[item] return popped + @annotate("DATAFRAME_RENAME", color="green", domain="cudf_python") def rename( self, mapper=None, @@ -3028,6 +3123,7 @@ def rename( else: return out.copy(deep=copy) + @annotate("DATAFRAME_ADD_PREFIX", color="green", domain="cudf_python") def add_prefix(self, prefix): out = self.copy(deep=True) out.columns = [ @@ -3035,6 +3131,7 @@ def add_prefix(self, prefix): ] return out + @annotate("DATAFRAME_ADD_SUFFIX", color="green", domain="cudf_python") def add_suffix(self, suffix): out = self.copy(deep=True) out.columns = [ @@ -3042,6 +3139,7 @@ def add_suffix(self, suffix): ] return out + @annotate("DATAFRAME_AGG", color="green", domain="cudf_python") def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. 
@@ -3173,6 +3271,7 @@ def agg(self, aggs, axis=None): else: raise ValueError("argument must be a string, list or dict") + @annotate("DATAFRAME_NLARGEST", color="green", domain="cudf_python") def nlargest(self, n, columns, keep="first"): """Get the rows of the DataFrame sorted by the n largest value of *columns* @@ -3304,6 +3403,7 @@ def nsmallest(self, n, columns, keep="first"): """ return self._n_largest_or_smallest(False, n, columns, keep) + @annotate("DATAFRAME_TRANSPOSE", color="green", domain="cudf_python") def transpose(self): """Transpose index and columns. @@ -3334,6 +3434,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) + @annotate("DATAFRAME_MELT", color="green", domain="cudf_python") def melt(self, **kwargs): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -3363,7 +3464,7 @@ def melt(self, **kwargs): return melt(self, **kwargs) - @annotate("JOIN", color="blue", domain="cudf_python") + @annotate("DATAFRAME_JOIN", color="blue", domain="cudf_python") def merge( self, right, @@ -3545,6 +3646,7 @@ def join( ) return df + @annotate("DATAFRAME_GROUPBY", color="green", domain="cudf_python") @copy_docstring(DataFrameGroupBy) def groupby( self, @@ -3660,7 +3762,7 @@ def query(self, expr, local_dict=None): """ # can't use `annotate` decorator here as we inspect the calling # environment. - with annotate("QUERY", color="purple", domain="cudf_python"): + with annotate("DATAFRAME_QUERY", color="purple", domain="cudf_python"): if local_dict is None: local_dict = {} @@ -3684,6 +3786,7 @@ def query(self, expr, local_dict=None): boolmask = queryutils.query_execute(self, expr, callenv) return self._apply_boolean_mask(boolmask) + @annotate("DATAFRAME_APPLY", color="green", domain="cudf_python") def apply( self, func, axis=1, raw=False, result_type=None, args=(), **kwargs ): @@ -3832,6 +3935,7 @@ def apply( return self._apply(func, _get_row_kernel, *args, **kwargs) + @annotate("DATAFRAME_APPLY_ROWS", color="green", domain="cudf_python") @applyutils.doc_apply() def apply_rows( self, @@ -3910,6 +4014,7 @@ def apply_rows( cache_key=cache_key, ) + @annotate("DATAFRAME_APPLY_CHUNKS", color="green", domain="cudf_python") @applyutils.doc_applychunks() def apply_chunks( self, @@ -3977,6 +4082,9 @@ def apply_chunks( tpb=tpb, ) + @annotate( + "DATAFRAME_PARTITION_BY_HASH", color="green", domain="cudf_python" + ) def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. @@ -4314,6 +4422,7 @@ def _sizeof_fmt(num, size_qualifier): cudf.utils.ioutils.buffer_write_lines(buf, lines) + @annotate("DATAFRAME_DESCRIBE", color="green", domain="cudf_python") @docutils.doc_describe() def describe( self, @@ -4373,6 +4482,7 @@ def describe( sort=False, ) + @annotate("DATAFRAME_TO_PANDAS", color="green", domain="cudf_python") def to_pandas(self, nullable=False, **kwargs): """ Convert to a Pandas DataFrame. @@ -4459,6 +4569,7 @@ def to_pandas(self, nullable=False, **kwargs): return out_df @classmethod + @annotate("DATAFRAME_FROM_PANDAS", color="green", domain="cudf_python") def from_pandas(cls, dataframe, nan_as_null=None): """ Convert from a Pandas DataFrame. 
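The DATAFRAME_* decorators blanketing this file are NVTX ranges for profiling with Nsight Systems. For context, the annotate helper (cuDF imports it from the nvtx package at this point) works both as a decorator and as a context manager; query above uses the context-manager form because it must inspect the caller's frame. A minimal standalone sketch, with hypothetical range names:

from nvtx import annotate

@annotate("MY_STEP", color="green", domain="example")
def my_step():
    # Appears as a named range on the Nsight Systems timeline.
    return 42

with annotate("MY_BLOCK", color="purple", domain="example"):
    my_step()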
@@ -4496,7 +4607,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): df = cls() # Set columns - for col_name, col_value in dataframe.iteritems(): + for col_name, col_value in dataframe.items(): # necessary because multi-index can return multiple # columns for a single key if len(col_value.shape) == 1: @@ -4528,6 +4639,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): return result @classmethod + @annotate("DATAFRAME_FROM_ARROW", color="green", domain="cudf_python") def from_arrow(cls, table): """ Convert from PyArrow Table to DataFrame. @@ -4583,6 +4695,7 @@ def from_arrow(cls, table): return out + @annotate("DATAFRAME_TO_ARROW", color="green", domain="cudf_python") def to_arrow(self, preserve_index=True): """ Convert to a PyArrow Table. @@ -4606,10 +4719,17 @@ def to_arrow(self, preserve_index=True): a: int64 b: int64 index: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] + index: [[1,2,3]] >>> df.to_arrow(preserve_index=False) pyarrow.Table a: int64 b: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] """ data = self.copy(deep=False) @@ -4638,8 +4758,8 @@ def to_arrow(self, preserve_index=True): for gen_name, col_name in zip( gen_names, self.index._data.names ): - data.insert( - data.shape[1], gen_name, self.index._data[col_name] + data._insert( + data.shape[1], gen_name, self.index._data[col_name], ) descr = gen_names[0] index_descr.append(descr) @@ -4657,6 +4777,7 @@ def to_arrow(self, preserve_index=True): return out.replace_schema_metadata(metadata) + @annotate("DATAFRAME_TO_RECORDS", color="green", domain="cudf_python") def to_records(self, index=True): """Convert to a numpy recarray @@ -4680,6 +4801,7 @@ def to_records(self, index=True): return ret @classmethod + @annotate("DATAFRAME_FROM_RECORDS", color="green", domain="cudf_python") def from_records(cls, data, index=None, columns=None, nan_as_null=False): """ Convert structured or record ndarray to DataFrame. @@ -4741,6 +4863,9 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): return df @classmethod + @annotate( + "DATAFRAME_FROM_ARRAYS_INTERNAL", color="green", domain="cudf_python" + ) def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): """Convert a numpy/cupy array to DataFrame. @@ -4800,6 +4925,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df._index = as_index(index) return df + @annotate("DATAFRAME_INTERPOLATE", color="green", domain="cudf_python") def interpolate( self, method="linear", @@ -4830,6 +4956,7 @@ def interpolate( **kwargs, ) + @annotate("DATAFRAME_QUANTILE", color="green", domain="cudf_python") def quantile( self, q=0.5, @@ -4945,6 +5072,7 @@ def quantile( result.index = q return result + @annotate("DATAFRAME_QUANTILES", color="green", domain="cudf_python") def quantiles(self, q=0.5, interpolation="nearest"): """ Return values at the given quantile. @@ -4984,6 +5112,7 @@ def quantiles(self, q=0.5, interpolation="nearest"): result.index = as_index(q) return result + @annotate("DATAFRAME_ISIN", color="green", domain="cudf_python") def isin(self, values): """ Whether each element in the DataFrame is contained in values. 
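The to_arrow docstring above picks up the data preview that newer pyarrow Table reprs print; the preserve_index behavior itself is unchanged. A usage sketch:

import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[1, 2, 3])
print(df.to_arrow())                      # carries an "index" column
print(df.to_arrow(preserve_index=False))  # just "a" and "b"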
@@ -5122,6 +5251,9 @@ def make_false_column_like_self(): # # Stats # + @annotate( + "DATAFRAME_PREPARE_FOR_ROWWISE_OP", color="green", domain="cudf_python" + ) def _prepare_for_rowwise_op(self, method, skipna): """Prepare a DataFrame for CuPy-based row-wise operations.""" @@ -5171,6 +5303,7 @@ def _prepare_for_rowwise_op(self, method, skipna): coerced = coerced.astype("int64", copy=False) return coerced, mask, common_dtype + @annotate("DATAFRAME_COUNT", color="green", domain="cudf_python") def count(self, axis=0, level=None, numeric_only=False, **kwargs): """ Count ``non-NA`` cells for each column or row. @@ -5217,6 +5350,7 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): "columns": 1, } + @annotate("DATAFRAME_REDUCE", color="green", domain="cudf_python") def _reduce( self, op, axis=None, level=None, numeric_only=None, **kwargs, ): @@ -5241,6 +5375,7 @@ def _reduce( elif axis == 1: return self._apply_cupy_method_axis_1(op, **kwargs) + @annotate("DATAFRAME_SCAN", color="green", domain="cudf_python") def _scan( self, op, axis=None, *args, **kwargs, ): @@ -5251,6 +5386,7 @@ def _scan( elif axis == 1: return self._apply_cupy_method_axis_1(f"cum{op}", **kwargs) + @annotate("DATAFRAME_MODE", color="green", domain="cudf_python") def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. @@ -5350,6 +5486,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df + @annotate("DATAFRAME_KURTOSIS", color="green", domain="cudf_python") def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5358,6 +5495,7 @@ def kurtosis( axis, skipna, level, numeric_only, **kwargs ) + @annotate("DATAFRAME_SKEW", color="green", domain="cudf_python") def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5366,14 +5504,17 @@ def skew( axis, skipna, level, numeric_only, **kwargs ) + @annotate("DATAFRAME_ALL", color="green", domain="cudf_python") def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).all(axis, skipna, level, **kwargs) + @annotate("DATAFRAME_ANY", color="green", domain="cudf_python") def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).any(axis, skipna, level, **kwargs) + @annotate("DATAFRAME_APPLY_CUPY", color="green", domain="cudf_python") def _apply_cupy_method_axis_1(self, method, *args, **kwargs): # This method uses cupy to perform scans and reductions along rows of a # DataFrame. Since cuDF is designed around columnar storage and @@ -5474,6 +5615,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result_df.columns = prepared.columns return result_df + @annotate("DATAFRAME_COLUMNS_VIEW", color="green", domain="cudf_python") def _columns_view(self, columns): """ Return a subset of the DataFrame's columns as a view. @@ -5482,6 +5624,7 @@ def _columns_view(self, columns): {col: self._data[col] for col in columns}, index=self.index ) + @annotate("DATAFRAME_SELECT_DTYPES", color="green", domain="cudf_python") def select_dtypes(self, include=None, exclude=None): """Return a subset of the DataFrame’s columns based on the column dtypes. 
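As _prepare_for_rowwise_op and _apply_cupy_method_axis_1 above indicate, row-wise (axis=1) reductions and scans are not natural for a columnar engine: the participating columns are coerced to a common dtype, materialized as a 2-D CuPy array, and reduced along rows, while axis=0 stays on the libcudf reduction path. Usage sketch:

import cudf

df = cudf.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
print(df.sum(axis=0))  # per column, via libcudf reductions
print(df.sum(axis=1))  # per row, via the CuPy-based path above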
@@ -5610,7 +5753,7 @@ def select_dtypes(self, include=None, exclude=None): for k, col in self._data.items(): infered_type = cudf_dtype_from_pydata_dtype(col.dtype) if infered_type in inclusion: - df.insert(len(df._data), k, col) + df._insert(len(df._data), k, col) return df @@ -5668,6 +5811,7 @@ def to_orc(self, fname, compression=None, *args, **kwargs): orc.to_orc(self, fname, compression, *args, **kwargs) + @annotate("DATAFRAME_STACK", color="green", domain="cudf_python") def stack(self, level=-1, dropna=True): """Stack the prescribed level(s) from columns to index @@ -5729,6 +5873,7 @@ def stack(self, level=-1, dropna=True): else: return result + @annotate("DATAFRAME_COV", color="green", domain="cudf_python") def cov(self, **kwargs): """Compute the covariance matrix of a DataFrame. @@ -5746,6 +5891,7 @@ def cov(self, **kwargs): df.columns = self.columns return df + @annotate("DATAFRAME_CORR", color="green", domain="cudf_python") def corr(self): """Compute the correlation matrix of a DataFrame.""" corr = cupy.corrcoef(self.values, rowvar=False) @@ -5753,6 +5899,7 @@ def corr(self): df.columns = self.columns return df + @annotate("DATAFRAME_TO_STRUCT", color="green", domain="cudf_python") def to_struct(self, name=None): """ Return a struct Series composed of the columns of the DataFrame. @@ -5766,8 +5913,16 @@ def to_struct(self, name=None): ----- Note that a copy of the columns is made. """ + if not all(isinstance(name, str) for name in self._data.names): + warnings.warn( + "DataFrame contains non-string column name(s). Struct column " + "requires field names to be strings. Non-string column names " + "will be cast to strings as the field names." + ) + field_names = [str(name) for name in self._data.names] + col = cudf.core.column.build_struct_column( - names=self._data.names, children=self._data.columns, size=len(self) + names=field_names, children=self._data.columns, size=len(self) ) return cudf.Series._from_data( cudf.core.column_accessor.ColumnAccessor( @@ -5777,6 +5932,7 @@ def to_struct(self, name=None): name=name, ) + @annotate("DATAFRAME_KEYS", color="green", domain="cudf_python") def keys(self): """ Get the columns. @@ -5824,6 +5980,7 @@ def iterrows(self): "if you wish to iterate over each row." ) + @annotate("DATAFRAME_APPEND", color="green", domain="cudf_python") def append( self, other, ignore_index=False, verify_integrity=False, sort=False ): @@ -5976,6 +6133,7 @@ def append( return cudf.concat(to_concat, ignore_index=ignore_index, sort=sort) + @annotate("DATAFRAME_PIVOT", color="green", domain="cudf_python") @copy_docstring(reshape.pivot) def pivot(self, index, columns, values=None): @@ -5983,12 +6141,14 @@ def pivot(self, index, columns, values=None): self, index=index, columns=columns, values=values ) + @annotate("DATAFRAME_UNSTACK", color="green", domain="cudf_python") @copy_docstring(reshape.unstack) def unstack(self, level=-1, fill_value=None): return cudf.core.reshape.unstack( self, level=level, fill_value=fill_value ) + @annotate("DATAFRAME_EXPLODE", color="green", domain="cudf_python") def explode(self, column, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -6038,6 +6198,46 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) + + def pct_change( + self, periods=1, fill_method="ffill", limit=None, freq=None + ): + """ + Calculates the percent change between sequential elements + in the DataFrame.
+ + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming percent change. + fill_method : str, default 'ffill' + How to handle NAs before computing percent changes. + limit : int, optional + The number of consecutive NAs to fill before stopping. + Not yet implemented. + freq : str, optional + Increment to use from time series API. + Not yet implemented. + + Returns + ------- + DataFrame + """ + if limit is not None: + raise NotImplementedError("limit parameter not supported yet.") + if freq is not None: + raise NotImplementedError("freq parameter not supported yet.") + elif fill_method not in {"ffill", "pad", "bfill", "backfill"}: + raise ValueError( + "fill_method must be one of 'ffill', 'pad', " + "'bfill', or 'backfill'." + ) + + data = self.fillna(method=fill_method, limit=limit) + + return data.diff(periods=periods) / data.shift( + periods=periods, freq=freq + ) + def __dataframe__( self, nan_as_null: bool = False, allow_copy: bool = True ): @@ -6194,6 +6394,7 @@ def func(left, right, output): ) +@annotate("CUDF_FROM_PANDAS", color="green", domain="cudf_python") def from_pandas(obj, nan_as_null=None): """ Convert certain Pandas objects into the cudf equivalent. @@ -6314,6 +6515,7 @@ def from_pandas(obj, nan_as_null=None): ) +@annotate("CUDF_MERGE", color="green", domain="cudf_python") def merge(left, right, *args, **kwargs): return left.merge(right, *args, **kwargs) @@ -6406,7 +6608,11 @@ def _setitem_with_dataframe( raise ValueError("Can not insert new column with a bool mask") else: # handle append case - input_df.insert(len(input_df._data), col_1, replace_df[col_2]) + input_df._insert( + loc=len(input_df._data), + name=col_1, + value=replace_df[col_2], + ) def extract_col(df, col): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index d55ab901b59..d478baf2d69 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations @@ -90,22 +90,22 @@ def _num_rows(self) -> int: return len(self._data.columns[0]) @property - def _column_names(self) -> List[Any]: # TODO: List[str]? - return self._data.names + def _column_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? + return tuple(self._data.names) @property - def _index_names(self) -> List[Any]: # TODO: List[str]? + def _index_names(self) -> Optional[Tuple[Any, ...]]: # TODO: Tuple[str]? # TODO: Temporarily suppressing mypy warnings to avoid introducing bugs # by returning an empty list where one is not expected. return ( None # type: ignore if self._index is None - else self._index._data.names + else tuple(self._index._data.names) ) @property - def _columns(self) -> List[Any]: # TODO: List[Column]? - return self._data.columns + def _columns(self) -> Tuple[Any, ...]: # TODO: Tuple[Column]? 
+ return tuple(self._data.columns) def serialize(self): header = { @@ -123,6 +123,7 @@ def deserialize(cls, header, frames): return cls_deserialize._from_data(dict(zip(column_names, columns))) @classmethod + @annotate("FRAME_FROM_DATA", color="green", domain="cudf_python") def _from_data( cls, data: MutableMapping, @@ -133,6 +134,7 @@ def _from_data( return obj @classmethod + @annotate("FRAME_FROM_COLUMNS", color="green", domain="cudf_python") def _from_columns( cls, columns: List[ColumnBase], @@ -163,6 +165,9 @@ def _from_columns( return cls._from_data(data, index) + @annotate( + "FRAME_FROM_COLUMNS_LIKE_SELF", color="green", domain="cudf_python" + ) def _from_columns_like_self( self, columns: List[ColumnBase], @@ -360,6 +365,7 @@ def memory_usage(self, deep=False): def __len__(self): return self._num_rows + @annotate("FRAME_COPY", color="green", domain="cudf_python") def copy(self: T, deep: bool = True) -> T: """ Make a copy of this object's indices and data. @@ -445,6 +451,7 @@ def copy(self: T, deep: bool = True) -> T: return new_frame + @annotate("FRAME_EQUALS", color="green", domain="cudf_python") def equals(self, other, **kwargs): """ Test whether two objects contain the same elements. @@ -527,6 +534,7 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) + @annotate("FRAME_EXPLODE", color="green", domain="cudf_python") def _explode(self, explode_column: Any, ignore_index: bool): """Helper function for `explode` in `Series` and `Dataframe`, explodes a specified nested column. Other columns' corresponding rows are @@ -550,6 +558,9 @@ def _explode(self, explode_column: Any, ignore_index: bool): res.index.names = self._index.names return res + @annotate( + "FRAME_GET_COLUMNS_BY_LABEL", color="green", domain="cudf_python" + ) def _get_columns_by_label(self, labels, downcast=False): """ Returns columns of the Frame specified by `labels` @@ -557,6 +568,9 @@ def _get_columns_by_label(self, labels, downcast=False): """ return self._data.select_by_label(labels) + @annotate( + "FRAME_GET_COLUMNS_BY_INDEX", color="green", domain="cudf_python" + ) def _get_columns_by_index(self, indices): """ Returns columns of the Frame specified by `labels` @@ -580,6 +594,7 @@ def _as_column(self): return self._data[None].copy(deep=False) + @annotate("FRAME_EMPTY_LIKE", color="green", domain="cudf_python") def _empty_like(self, keep_index=True): result = self.__class__._from_data( *libcudf.copying.table_empty_like(self, keep_index) @@ -669,6 +684,13 @@ def get_column_values_na(col): matrix[:, i] = get_column_values_na(col) return matrix + # TODO: As of now, calling cupy.asarray is _much_ faster than calling + # to_cupy. We should investigate the reasons why and whether we can provide + # a more efficient method here by exploiting __cuda_array_interface__. In + # particular, we need to benchmark how much of the overhead is coming from + # (potentially unavoidable) local copies in to_cupy and how much comes from + # inefficiencies in the implementation. 
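The TODO above can be reproduced from user code: for a null-free numeric Series, `cupy.asarray` consumes `__cuda_array_interface__` directly, while `to_cupy` goes through the generic conversion path. A small illustration of the two routes (not a benchmark):

```python
import cupy as cp
import cudf

s = cudf.Series([1, 2, 3])
a = cp.asarray(s)   # zero-copy view via __cuda_array_interface__
b = s.to_cupy()     # generic conversion path; may allocate and copy
assert bool((a == b).all())
```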
+ @annotate("FRAME_TO_CUPY", color="green", domain="cudf_python") def to_cupy( self, dtype: Union[Dtype, None] = None, @@ -703,6 +725,7 @@ def to_cupy( na_value, ) + @annotate("FRAME_TO_NUMPY", color="green", domain="cudf_python") def to_numpy( self, dtype: Union[Dtype, None] = None, @@ -737,6 +760,7 @@ def to_numpy( (lambda col: col.values_host), np.empty, dtype, na_value ) + @annotate("FRAME_CLIP", color="green", domain="cudf_python") def clip(self, lower=None, upper=None, inplace=False, axis=1): """ Trim values at input threshold(s). @@ -864,6 +888,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) + @annotate("FRAME_WHERE", color="green", domain="cudf_python") def where(self, cond, other=None, inplace=False): """ Replace values where the condition is False. @@ -922,6 +947,7 @@ def where(self, cond, other=None, inplace=False): frame=self, cond=cond, other=other, inplace=inplace ) + @annotate("FRAME_MASK", color="green", domain="cudf_python") def mask(self, cond, other=None, inplace=False): """ Replace values where the condition is True. @@ -983,6 +1009,7 @@ def mask(self, cond, other=None, inplace=False): return self.where(cond=~cond, other=other, inplace=inplace) + @annotate("FRAME_PIPE", color="green", domain="cudf_python") def pipe(self, func, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``. @@ -1113,6 +1140,7 @@ def scatter_by_map( return result + @annotate("FRAME_FILLNA", color="green", domain="cudf_python") def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1238,7 +1266,7 @@ def fillna( value = value elif not isinstance(value, abc.Mapping): value = {name: copy.deepcopy(value) for name in self._data.names} - elif isinstance(value, abc.Mapping): + else: value = { key: value.reindex(self.index) if isinstance(value, cudf.Series) @@ -1246,19 +1274,28 @@ def fillna( for key, value in value.items() } - copy_data = self._data.copy(deep=True) - - for name in copy_data.keys(): + filled_data = {} + for col_name, col in self._data.items(): + if col_name in value and method is None: + replace_val = value[col_name] + else: + replace_val = None should_fill = ( - name in value - and not libcudf.scalar._is_null_host_scalar(value[name]) + col_name in value + and col.contains_na_entries + and not libcudf.scalar._is_null_host_scalar(replace_val) ) or method is not None if should_fill: - copy_data[name] = copy_data[name].fillna(value[name], method) - result = self._from_data(copy_data, self._index) + filled_data[col_name] = col.fillna(replace_val, method) + else: + filled_data[col_name] = col.copy(deep=True) - return self._mimic_inplace(result, inplace=inplace) + return self._mimic_inplace( + self._from_data(data=filled_data, index=self._index), + inplace=inplace, + ) + @annotate("FRAME_DROPNA_COLUMNS", color="green", domain="cudf_python") def _drop_na_columns(self, how="any", subset=None, thresh=None): """ Drop columns containing nulls @@ -1276,16 +1313,21 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): else: thresh = len(df) - for col in self._data.names: + for name, col in df._data.items(): + try: + check_col = col.nans_to_nulls() + except AttributeError: + check_col = col no_threshold_valid_count = ( - len(df[col]) - df[col].nans_to_nulls().null_count + len(col) - check_col.null_count ) < thresh if no_threshold_valid_count: continue - out_cols.append(col) + out_cols.append(name) return self[out_cols] + @annotate("FRAME_INTERPOLATE", color="green", 
domain="cudf_python") def interpolate( self, method="linear", @@ -1355,6 +1397,7 @@ def interpolate( else result._gather(perm_sort.argsort()) ) + @annotate("FRAME_QUANTILES", color="green", domain="cudf_python") def _quantiles( self, q, @@ -1387,6 +1430,7 @@ def _quantiles( result._copy_type_metadata(self) return result + @annotate("FRAME_RANK", color="green", domain="cudf_python") def rank( self, axis=0, @@ -1463,6 +1507,7 @@ def rank( return self._from_data(data, index).astype(np.float64) + @annotate("FRAME_REPEAT", color="green", domain="cudf_python") def repeat(self, repeats, axis=None): """Repeats elements consecutively. @@ -1552,6 +1597,7 @@ def repeat(self, repeats, axis=None): result._copy_type_metadata(self) return result + @annotate("FRAME_SHIFT", color="green", domain="cudf_python") def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" axis = self._get_axis_from_axis_arg(axis) @@ -1567,7 +1613,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): zip(self._column_names, data_columns), self._index ) - @annotate("SAMPLE", color="orange", domain="cudf_python") + @annotate("FRAME_SAMPLE", color="orange", domain="cudf_python") def sample( self, n=None, @@ -1761,7 +1807,7 @@ def sample( return result @classmethod - @annotate("FROM_ARROW", color="orange", domain="cudf_python") + @annotate("FRAME_FROM_ARROW", color="orange", domain="cudf_python") def from_arrow(cls, data): """Convert from PyArrow Table to Frame @@ -1901,7 +1947,7 @@ def from_arrow(cls, data): return cls._from_data({name: result[name] for name in column_names}) - @annotate("TO_ARROW", color="orange", domain="cudf_python") + @annotate("FRAME_TO_ARROW", color="orange", domain="cudf_python") def to_arrow(self): """ Convert to arrow Table @@ -1916,6 +1962,10 @@ def to_arrow(self): a: int64 b: int64 index: int64 + ---- + a: [[1,2,3]] + b: [[4,5,6]] + index: [[1,2,3]] """ return pa.Table.from_pydict( {name: col.to_arrow() for name, col in self._data.items()} @@ -1933,6 +1983,7 @@ def _positions_from_column_names(self, column_names): if name in set(column_names) ] + @annotate("FRAME_REPLACE", color="green", domain="cudf_python") def replace( self, to_replace=None, @@ -2146,7 +2197,7 @@ def replace( ) if not (to_replace is None and value is None): - copy_data = self._data.copy(deep=False) + copy_data = {} ( all_na_per_column, to_replace_per_column, @@ -2155,11 +2206,11 @@ def replace( to_replace=to_replace, value=value, columns_dtype_map={ - col: copy_data._data[col].dtype for col in copy_data._data + col: self._data[col].dtype for col in self._data }, ) - for name, col in copy_data.items(): + for name, col in self._data.items(): try: copy_data[name] = col.find_and_replace( to_replace_per_column[name], @@ -2219,6 +2270,7 @@ def _copy_type_metadata( return self + @annotate("FRAME_ISNULL", color="green", domain="cudf_python") def isnull(self): """ Identify missing values. @@ -2300,6 +2352,7 @@ def isnull(self): # Alias for isnull isna = isnull + @annotate("FRAME_NOTNULL", color="green", domain="cudf_python") def notnull(self): """ Identify non-missing values. @@ -2381,6 +2434,7 @@ def notnull(self): # Alias for notnull notna = notnull + @annotate("FRAME_INTERLEAVE_COLUMNS", color="green", domain="cudf_python") def interleave_columns(self): """ Interleave Series columns of a table into a single column. 
@@ -2420,6 +2474,7 @@ def interleave_columns(self): return result + @annotate("FRAME_TILE", color="green", domain="cudf_python") def tile(self, count): """ Repeats the rows from `self` DataFrame `count` times to form a @@ -2449,6 +2504,7 @@ def tile(self, count): result._copy_type_metadata(self) return result + @annotate("FRAME_SEARCHSORTED", color="green", domain="cudf_python") def searchsorted( self, values, side="left", ascending=True, na_position="last" ): @@ -2533,7 +2589,7 @@ def searchsorted( else: return result - @annotate("ARGSORT", color="yellow", domain="cudf_python") + @annotate("FRAME_ARGSORT", color="yellow", domain="cudf_python") def argsort( self, by=None, @@ -2636,6 +2692,7 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): return libcudf.sort.order_by(to_sort, ascending, na_position) + @annotate("FRAME_SIN", color="green", domain="cudf_python") def sin(self): """ Get Trigonometric sine, element-wise. @@ -2697,6 +2754,7 @@ def sin(self): """ return self._unaryop("sin") + @annotate("FRAME_COS", color="green", domain="cudf_python") def cos(self): """ Get Trigonometric cosine, element-wise. @@ -2758,6 +2816,7 @@ def cos(self): """ return self._unaryop("cos") + @annotate("FRAME_TAN", color="green", domain="cudf_python") def tan(self): """ Get Trigonometric tangent, element-wise. @@ -2819,6 +2878,7 @@ def tan(self): """ return self._unaryop("tan") + @annotate("FRAME_ASIN", color="green", domain="cudf_python") def asin(self): """ Get Trigonometric inverse sine, element-wise. @@ -2869,6 +2929,7 @@ def asin(self): """ return self._unaryop("asin") + @annotate("FRAME_ACOS", color="green", domain="cudf_python") def acos(self): """ Get Trigonometric inverse cosine, element-wise. @@ -2927,6 +2988,7 @@ def acos(self): result = result.mask((result < 0) | (result > np.pi + 1)) return result + @annotate("FRAME_ATAN", color="green", domain="cudf_python") def atan(self): """ Get Trigonometric inverse tangent, element-wise. @@ -2987,6 +3049,7 @@ def atan(self): """ return self._unaryop("atan") + @annotate("FRAME_EXP", color="green", domain="cudf_python") def exp(self): """ Get the exponential of all elements, element-wise. @@ -3049,6 +3112,7 @@ def exp(self): """ return self._unaryop("exp") + @annotate("FRAME_LOG", color="green", domain="cudf_python") def log(self): """ Get the natural logarithm of all elements, element-wise. @@ -3110,6 +3174,7 @@ def log(self): """ return self._unaryop("log") + @annotate("FRAME_SQRT", color="green", domain="cudf_python") def sqrt(self): """ Get the non-negative square-root of all elements, element-wise. @@ -3165,6 +3230,7 @@ def sqrt(self): """ return self._unaryop("sqrt") + @annotate("FRAME_ABS", color="green", domain="cudf_python") def abs(self): """ Return a Series/DataFrame with absolute numeric value of each element. @@ -3191,6 +3257,7 @@ def abs(self): return self._unaryop("abs") # Rounding + @annotate("FRAME_CEIL", color="green", domain="cudf_python") def ceil(self): """ Rounds each value upward to the smallest integral value not less @@ -3227,6 +3294,7 @@ def ceil(self): return self._unaryop("ceil") + @annotate("FRAME_FLOOR", color="green", domain="cudf_python") def floor(self): """Rounds each value downward to the largest integral value not greater than the original. 
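`searchsorted`, annotated in the hunks above, returns insertion points that keep the data sorted; a short usage sketch:

```python
import cudf

s = cudf.Series([1, 3, 5, 7])
print(s.searchsorted(4))                # 2: leftmost position where 4 fits
print(s.searchsorted(5, side="right"))  # 3: insert after the existing 5
```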
@@ -3266,6 +3334,7 @@ def floor(self): return self._unaryop("floor") + @annotate("FRAME_SCALE", color="green", domain="cudf_python") def scale(self): """ Scale values to [0, 1] in float64 @@ -3300,6 +3369,7 @@ def scale(self): scaled._index = self._index.copy(deep=False) return scaled + @annotate("FRAME_INTERNAL_MERGE", color="green", domain="cudf_python") def _merge( self, right, @@ -3343,6 +3413,7 @@ def _merge( suffixes=suffixes, ).perform_merge() + @annotate("FRAME_IS_SORTED", color="green", domain="cudf_python") def _is_sorted(self, ascending=None, null_position=None): """ Returns a boolean indicating whether the data of the Frame are sorted @@ -3373,12 +3444,14 @@ def _is_sorted(self, ascending=None, null_position=None): self, ascending=ascending, null_position=null_position ) + @annotate("FRAME_SPLIT", color="green", domain="cudf_python") def _split(self, splits, keep_index=True): results = libcudf.copying.table_split( self, splits, keep_index=keep_index ) return [self.__class__._from_data(*result) for result in results] + @annotate("FRAME_ENCODE", color="green", domain="cudf_python") def _encode(self): data, index, indices = libcudf.transform.table_encode(self) for name, col in data.items(): @@ -3386,6 +3459,7 @@ def _encode(self): keys = self.__class__._from_data(data, index) return keys, indices + @annotate("FRAME_UNARYOP", color="green", domain="cudf_python") def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) return self.__class__._from_data( @@ -3425,6 +3499,7 @@ def _binaryop( raise NotImplementedError @classmethod + @annotate("FRAME_COLWISE_BINOP", color="green", domain="cudf_python") def _colwise_binop( cls, operands: Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], @@ -3583,6 +3658,7 @@ def _colwise_binop( return output + @annotate("FRAME_DOT", color="green", domain="cudf_python") def dot(self, other, reflect=False): """ Get dot product of frame and other, (binary operator `dot`). @@ -3622,6 +3698,8 @@ def dot(self, other, reflect=False): >>> [1, 2, 3, 4] @ s 10 """ + # TODO: This function does not currently support nulls. + # TODO: This function does not properly support misaligned indexes. lhs = self.values if isinstance(other, Frame): rhs = other.values @@ -3632,6 +3710,16 @@ def dot(self, other, reflect=False): ): rhs = cupy.asarray(other) else: + # TODO: This should raise an exception, not return NotImplemented, + # but __matmul__ relies on the current behavior. We should either + # move this implementation to __matmul__ and call it from here + # (checking for NotImplemented and raising NotImplementedError if + # that's what's returned), or __matmul__ should catch a + # NotImplementedError from here and return NotImplemented. The + # latter feels cleaner (putting the implementation in this method + # rather than in the operator) but will be slower in the (highly + # unlikely) case that we're multiplying a cudf object with another + # type of object that somehow supports this behavior. return NotImplemented if reflect: lhs, rhs = rhs, lhs @@ -3743,6 +3831,7 @@ def _reduce(self, *args, **kwargs): f"Reductions are not supported for objects of type {type(self)}." 
) + @annotate("FRAME_MIN", color="green", domain="cudf_python") def min( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3788,6 +3877,7 @@ def min( **kwargs, ) + @annotate("FRAME_MAX", color="green", domain="cudf_python") def max( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3833,6 +3923,7 @@ def max( **kwargs, ) + @annotate("FRAME_SUM", color="green", domain="cudf_python") def sum( self, axis=None, @@ -3891,6 +3982,7 @@ def sum( **kwargs, ) + @annotate("FRAME_PRODUCT", color="green", domain="cudf_python") def product( self, axis=None, @@ -3955,6 +4047,7 @@ def product( # Alias for pandas compatibility. prod = product + @annotate("FRAME_MEAN", color="green", domain="cudf_python") def mean( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -3999,6 +4092,7 @@ def mean( **kwargs, ) + @annotate("FRAME_STD", color="green", domain="cudf_python") def std( self, axis=None, @@ -4055,6 +4149,7 @@ def std( **kwargs, ) + @annotate("FRAME_VAR", color="green", domain="cudf_python") def var( self, axis=None, @@ -4110,6 +4205,7 @@ def var( **kwargs, ) + @annotate("FRAME_KURTOSIS", color="green", domain="cudf_python") def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4178,6 +4274,7 @@ def kurt( **kwargs, ) + @annotate("FRAME_SKEW", color="green", domain="cudf_python") def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4235,6 +4332,7 @@ def skew( **kwargs, ) + @annotate("FRAME_ALL", color="green", domain="cudf_python") def all(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether all elements are True in DataFrame. @@ -4270,6 +4368,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): "all", axis=axis, skipna=skipna, level=level, **kwargs, ) + @annotate("FRAME_ANY", color="green", domain="cudf_python") def any(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether any elements is True in DataFrame. @@ -4305,6 +4404,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): "any", axis=axis, skipna=skipna, level=level, **kwargs, ) + @annotate("FRAME_SUM_OF_SQUARES", color="green", domain="cudf_python") def sum_of_squares(self, dtype=None): """Return the sum of squares of values. @@ -4328,6 +4428,7 @@ def sum_of_squares(self, dtype=None): """ return self._reduce("sum_of_squares", dtype=dtype) + @annotate("FRAME_MEDIAN", color="green", domain="cudf_python") def median( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4373,22 +4474,28 @@ def median( ) # Scans + @annotate("FRAME_SCAN", color="green", domain="cudf_python") def _scan(self, op, axis=None, skipna=True, cast_to_int=False): skipna = True if skipna is None else skipna results = {} for name, col in self._data.items(): if skipna: - result_col = self._data[name].nans_to_nulls() + try: + result_col = col.nans_to_nulls() + except AttributeError: + result_col = col else: - result_col = self._data[name].copy() - if result_col.has_nulls(include_nan=True): + if col.has_nulls(include_nan=True): # Workaround as find_first_value doesn't seem to work # incase of bools. first_index = int( - result_col.isnull().astype("int8").find_first_value(1) + col.isnull().astype("int8").find_first_value(1) ) + result_col = col.copy() result_col[first_index:] = None + else: + result_col = col if ( cast_to_int @@ -4407,6 +4514,7 @@ def _scan(self, op, axis=None, skipna=True, cast_to_int=False): # for Index._from_data and simplify. 
return self._from_data(results, index=self._index) + @annotate("FRAME_CUMMIN", color="green", domain="cudf_python") def cummin(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative minimum of the Series or DataFrame. @@ -4450,6 +4558,7 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): """ return self._scan("min", axis=axis, skipna=skipna, *args, **kwargs) + @annotate("FRAME_CUMMAX", color="green", domain="cudf_python") def cummax(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative maximum of the Series or DataFrame. @@ -4493,6 +4602,7 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): """ return self._scan("max", axis=axis, skipna=skipna, *args, **kwargs) + @annotate("FRAME_CUMSUM", color="green", domain="cudf_python") def cumsum(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative sum of the Series or DataFrame. @@ -4539,6 +4649,7 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): "sum", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs ) + @annotate("FRAME_CUMPROD", color="green", domain="cudf_python") def cumprod(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative product of the Series or DataFrame. @@ -4584,6 +4695,7 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): "prod", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs ) + @annotate("FRAME_TO_JSON", color="green", domain="cudf_python") @ioutils.doc_to_json() def to_json(self, path_or_buf=None, *args, **kwargs): """{docstring}""" @@ -4592,18 +4704,21 @@ def to_json(self, path_or_buf=None, *args, **kwargs): self, path_or_buf=path_or_buf, *args, **kwargs ) + @annotate("FRAME_TO_HDF", color="green", domain="cudf_python") @ioutils.doc_to_hdf() def to_hdf(self, path_or_buf, key, *args, **kwargs): """{docstring}""" cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) + @annotate("FRAME_TO_DLPACK", color="green", domain="cudf_python") @ioutils.doc_to_dlpack() def to_dlpack(self): """{docstring}""" return cudf.io.dlpack.to_dlpack(self) + @annotate("FRAME_TO_STRING", color="green", domain="cudf_python") def to_string(self): """ Convert to string @@ -4629,12 +4744,15 @@ def to_string(self): def __str__(self): return self.to_string() + @annotate("FRAME_DEEP_COPY", color="green", domain="cudf_python") def __deepcopy__(self, memo): return self.copy(deep=True) + @annotate("FRAME_COPY", color="green", domain="cudf_python") def __copy__(self): return self.copy(deep=False) + @annotate("FRAME_HEAD", color="green", domain="cudf_python") def head(self, n=5): """ Return the first `n` rows. 
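`__copy__` and `__deepcopy__` above simply forward to `copy(deep=False)` and `copy(deep=True)`, so the standard-library helpers behave as expected:

```python
import copy
import cudf

df = cudf.DataFrame({"a": [1, 2]})
shallow = copy.copy(df)   # copy(deep=False): shares the column buffers
deep = copy.deepcopy(df)  # copy(deep=True): fully independent data
```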
@@ -4718,6 +4836,7 @@ def head(self, n=5): """ return self.iloc[:n] + @annotate("FRAME_TAIL", color="green", domain="cudf_python") def tail(self, n=5): """ Returns the last n rows as a new DataFrame or Series @@ -4749,6 +4868,7 @@ def tail(self, n=5): return self.iloc[-n:] + @annotate("FRAME_ROLLING", color="green", domain="cudf_python") @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None @@ -4762,6 +4882,7 @@ def rolling( win_type=win_type, ) + @annotate("FRAME_NANS_TO_NULLS", color="green", domain="cudf_python") def nans_to_nulls(self): """ Convert nans (if any) to nulls @@ -4808,14 +4929,15 @@ def nans_to_nulls(self): 1 3.14 2 """ - return self._from_data( - { - name: col.copy().nans_to_nulls() - for name, col in self._data.items() - }, - self._index, - ) + result_data = {} + for name, col in self._data.items(): + try: + result_data[name] = col.nans_to_nulls() + except AttributeError: + result_data[name] = col.copy() + return self._from_data(result_data, self._index) + @annotate("FRAME_INVERT", color="green", domain="cudf_python") def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data( @@ -4826,6 +4948,7 @@ def __invert__(self): self._index, ) + @annotate("FRAME_ADD", color="green", domain="cudf_python") def add(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -4896,6 +5019,7 @@ def add(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "add", fill_value) + @annotate("FRAME_RADD", color="green", domain="cudf_python") def radd(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -4975,6 +5099,7 @@ def radd(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "add", fill_value, reflect=True) + @annotate("FRAME_SUBTRACT", color="green", domain="cudf_python") def subtract(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -5057,6 +5182,7 @@ def subtract(self, other, axis, level=None, fill_value=None): sub = subtract + @annotate("FRAME_RSUB", color="green", domain="cudf_python") def rsub(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -5140,6 +5266,7 @@ def rsub(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "sub", fill_value, reflect=True) + @annotate("FRAME_MULTIPLY", color="green", domain="cudf_python") def multiply(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5224,6 +5351,7 @@ def multiply(self, other, axis, level=None, fill_value=None): mul = multiply + @annotate("FRAME_RMUL", color="green", domain="cudf_python") def rmul(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5308,6 +5436,7 @@ def rmul(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mul", fill_value, reflect=True) + @annotate("FRAME_MOD", color="green", domain="cudf_python") def mod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5378,6 +5507,7 @@ def mod(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mod", fill_value) + @annotate("FRAME_RMOD", 
color="green", domain="cudf_python") def rmod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5460,6 +5590,7 @@ def rmod(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mod", fill_value, reflect=True) + @annotate("FRAME_POW", color="green", domain="cudf_python") def pow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe series and other, element-wise @@ -5539,6 +5670,7 @@ def pow(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "pow", fill_value) + @annotate("FRAME_RPOW", color="green", domain="cudf_python") def rpow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe or series and other, element-wise @@ -5618,6 +5750,7 @@ def rpow(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "pow", fill_value, reflect=True) + @annotate("FRAME_FLOORDIV", color="green", domain="cudf_python") def floordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5697,6 +5830,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "floordiv", fill_value) + @annotate("FRAME_RFLOORDIV", color="green", domain="cudf_python") def rfloordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5793,6 +5927,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "floordiv", fill_value, reflect=True) + @annotate("FRAME_TRUEDIV", color="green", domain="cudf_python") def truediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -5881,6 +6016,7 @@ def truediv(self, other, axis, level=None, fill_value=None): div = truediv divide = truediv + @annotate("FRAME_RTRUEDIV", color="green", domain="cudf_python") def rtruediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -5973,6 +6109,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # Alias for rtruediv rdiv = rtruediv + @annotate("FRAME_EQ", color="green", domain="cudf_python") def eq(self, other, axis="columns", level=None, fill_value=None): """Equal to, element-wise (binary operator eq). @@ -6048,6 +6185,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None): other=other, fn="eq", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_NE", color="green", domain="cudf_python") def ne(self, other, axis="columns", level=None, fill_value=None): """Not equal to, element-wise (binary operator ne). @@ -6123,6 +6261,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None): other=other, fn="ne", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_LT", color="green", domain="cudf_python") def lt(self, other, axis="columns", level=None, fill_value=None): """Less than, element-wise (binary operator lt). @@ -6198,6 +6337,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None): other=other, fn="lt", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_LE", color="green", domain="cudf_python") def le(self, other, axis="columns", level=None, fill_value=None): """Less than or equal, element-wise (binary operator le). 
@@ -6273,6 +6413,7 @@ def le(self, other, axis="columns", level=None, fill_value=None): other=other, fn="le", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_GT", color="green", domain="cudf_python") def gt(self, other, axis="columns", level=None, fill_value=None): """Greater than, element-wise (binary operator gt). @@ -6348,6 +6489,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None): other=other, fn="gt", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_GE", color="green", domain="cudf_python") def ge(self, other, axis="columns", level=None, fill_value=None): """Greater than or equal, element-wise (binary operator ge). @@ -6446,6 +6588,11 @@ def nunique(self, method: builtins.str = "sort", dropna: bool = True): } +@annotate( + "FRAME_GET_REPLACEMENT_VALUES_FOR_COLUMNS", + color="green", + domain="cudf_python", +) def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] ) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]: @@ -6610,6 +6757,7 @@ def _is_series(obj): return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None +@annotate("FRAME_DROP_ROWS_BY_LABELS", color="green", domain="cudf_python") def _drop_rows_by_labels( obj: DataFrameOrSeries, labels: Union[ColumnLike, abc.Iterable, str], @@ -6658,10 +6806,9 @@ def _drop_rows_by_labels( join_res = working_df.join(to_join, how="leftanti") # 4. Reconstruct original layout, and rename - join_res.insert( + join_res._insert( ilevel, name=join_res._index.name, value=join_res._index ) - join_res = join_res.reset_index(drop=True) midx = cudf.MultiIndex.from_frame( join_res.iloc[:, 0:idx_nlv], names=obj._index.names diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ff700144bed..b90f857ce84 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -4,6 +4,7 @@ import itertools import pickle import warnings +from functools import cached_property import numpy as np import pandas as pd @@ -11,12 +12,13 @@ import cudf from cudf._lib import groupby as libgroupby +from cudf._lib.reshape import interleave_columns from cudf._typing import DataFrameOrSeries from cudf.api.types import is_list_like from cudf.core.abc import Serializable from cudf.core.column.column import arange, as_column from cudf.core.multiindex import MultiIndex -from cudf.utils.utils import GetAttrGetItemMixin, cached_property +from cudf.utils.utils import GetAttrGetItemMixin # The three functions below return the quantiles [25%, 50%, 75%] @@ -286,7 +288,7 @@ def agg(self, func): if not self._as_index: for col_name in reversed(self.grouping._named_columns): - result.insert( + result._insert( 0, col_name, result.index.get_level_values(col_name)._values, @@ -958,6 +960,177 @@ def corr(self, method="pearson", min_periods=1): return res + def cov(self, min_periods=0, ddof=1): + """ + Compute the pairwise covariance among the columns of a DataFrame, + excluding NA/null values. + + The returned DataFrame is the covariance matrix of the columns of + the DataFrame. + + Both NA and null values are automatically excluded from the + calculation. See the note below about bias from missing values. + + A threshold can be set for the minimum number of observations + for each value created. Comparisons with observations below this + threshold will be returned as `NA`. 
+ + This method is generally used for the analysis of time series data to + understand the relationship between different measures across time. + + Parameters + ---------- + min_periods: int, optional + Minimum number of observations required per pair of columns + to have a valid result. + + ddof: int, optional + Delta degrees of freedom, default is 1. + + Returns + ------- + DataFrame + Covariance matrix. + + Notes + ----- + Returns the covariance matrix of the DataFrame's time series. + The covariance is normalized by N-ddof. + + For DataFrames that have Series that are missing data + (assuming that data is missing at random) the returned covariance + matrix will be an unbiased estimate of the variance and covariance + between the member Series. + + However, for many applications this estimate may not be acceptable + because the estimated covariance matrix is not guaranteed to be + positive semi-definite. This could lead to estimated correlations + having absolute values which are greater than one, and/or a + non-invertible covariance matrix. See + `Estimation of covariance matrices + <https://en.wikipedia.org/wiki/Estimation_of_covariance_matrices>`_ + for more details. + + Examples + -------- + >>> import cudf + >>> gdf = cudf.DataFrame({ + ... "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + ... "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + ... }) + >>> gdf + id val1 val2 val3 + 0 a 5 4 4 + 1 a 4 5 5 + 2 a 6 6 6 + 3 b 4 1 1 + 4 b 8 2 2 + 5 b 7 9 9 + 6 c 4 8 8 + 7 c 5 5 5 + 8 c 2 1 1 + >>> gdf.groupby("id").cov() + val1 val2 val3 + id + a val1 1.000000 0.500000 0.500000 + val2 0.500000 1.000000 1.000000 + val3 0.500000 1.000000 1.000000 + b val1 4.333333 3.500000 3.500000 + val2 3.500000 19.000000 19.000000 + val3 3.500000 19.000000 19.000000 + c val1 2.333333 3.833333 3.833333 + val2 3.833333 12.333333 12.333333 + val3 3.833333 12.333333 12.333333 + """ + + # create an expanded dataframe consisting of all combinations of + # the struct column-pairs used in the covariance calculation, + # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) + column_names = self.grouping.values.columns.tolist() + num_cols = len(column_names) + + column_pair_structs = {} + for x, y in itertools.combinations_with_replacement(column_names, 2): + # The number of output columns is the number of input columns + # squared. We directly call the struct column factory here to + # reduce overhead and avoid copying data. Since libcudf groupby + # maintains a cache of aggregation requests, reusing the same + # column also makes use of previously cached column means and + # reduces kernel costs. + + # check if the input column names are strings; raise a warning + # if not, and cast them to strings + if not (isinstance(x, str) and isinstance(y, str)): + warnings.warn( + "DataFrame contains non-string column name(s). " + "Struct columns require field names to be strings. " + "Non-string column names will be cast to strings " + "in the result's field names."
+ ) + x, y = str(x), str(y) + + column_pair_structs[(x, y)] = cudf.core.column.build_struct_column( + names=(x, y), + children=(self.obj._data[x], self.obj._data[y]), + size=len(self.obj), + ) + + column_pair_groupby = cudf.DataFrame._from_data( + column_pair_structs + ).groupby(by=self.grouping.keys) + + try: + gb_cov = column_pair_groupby.agg( + lambda x: x.cov(min_periods, ddof) + ) + except RuntimeError as e: + if "Unsupported groupby reduction type-agg combination" in str(e): + raise TypeError( + "Covariance accepts only numerical column-pairs" + ) + raise + + # ensure that column-pair labels are arranged in ascending order + cols_list = [ + (y, x) if i > j else (x, y) + for j, y in enumerate(column_names) + for i, x in enumerate(column_names) + ] + cols_split = [ + cols_list[i : i + num_cols] + for i in range(0, len(cols_list), num_cols) + ] + + def combine_columns(gb_cov, ys): + list_of_columns = [gb_cov._data[y] for y in ys] + frame = cudf.core.frame.Frame._from_columns(list_of_columns, ys) + return interleave_columns(frame) + + # interleave: combine the correlation results for each column-pair + # into a single column + res = cudf.DataFrame._from_data( + { + x: combine_columns(gb_cov, ys) + for ys, x in zip(cols_split, column_names) + } + ) + + # create a multiindex for the groupby correlated dataframe, + # to match pandas behavior + unsorted_idx = gb_cov.index.repeat(num_cols) + idx_sort_order = unsorted_idx._get_sorted_inds() + sorted_idx = unsorted_idx._gather(idx_sort_order) + if len(gb_cov): + # TO-DO: Should the operation below be done on the CPU instead? + sorted_idx._data[None] = as_column( + np.tile(column_names, len(gb_cov.index)) + ) + res.index = MultiIndex._from_data(sorted_idx._data) + + return res + def var(self, ddof=1): """Compute the column-wise variance of the values in each group. diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index f71f930a21c..5b60e8dbd1c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1,10 +1,11 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. from __future__ import annotations import math import pickle import warnings +from functools import cached_property from numbers import Number from typing import ( Any, @@ -54,7 +55,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import find_common_type -from cudf.utils.utils import cached_property, search_range +from cudf.utils.utils import search_range T = TypeVar("T", bound="Frame") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fab5d75f62b..bc7337d0a42 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
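The MultiIndex construction at the end of `cov` repeats each group key once per value column and lays the column names alongside, which is what produces the two-level index in the docstring example above. The layout logic, sketched standalone with NumPy:

```python
import numpy as np

group_keys = ["a", "b"]          # one entry per group
column_names = ["val1", "val2"]  # value columns of the covariance matrix

level0 = np.repeat(group_keys, len(column_names))  # a, a, b, b
level1 = np.tile(column_names, len(group_keys))    # val1, val2, val1, val2
print(list(zip(level0, level1)))
# [('a', 'val1'), ('a', 'val2'), ('b', 'val1'), ('b', 'val2')]
```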
"""Base class for Frame types that have an index.""" from __future__ import annotations @@ -6,6 +6,7 @@ import operator import warnings from collections import Counter, abc +from functools import cached_property from typing import Callable, Type, TypeVar from uuid import uuid4 @@ -24,13 +25,11 @@ is_integer_dtype, is_list_like, ) -from cudf.core.column import arange, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame -from cudf.utils.utils import cached_property doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. @@ -59,15 +58,14 @@ def _indices_from_labels(obj, labels): - from cudf.core.column import column if not isinstance(labels, cudf.MultiIndex): - labels = column.as_column(labels) + labels = cudf.core.column.as_column(labels) if is_categorical_dtype(obj.index): labels = labels.astype("category") codes = labels.codes.astype(obj.index._values.codes.dtype) - labels = column.build_categorical_column( + labels = cudf.core.column.build_categorical_column( categories=labels.dtype.categories, codes=codes, ordered=labels.dtype.ordered, @@ -78,8 +76,12 @@ def _indices_from_labels(obj, labels): # join is not guaranteed to maintain the index ordering # so we will sort it with its initial ordering which is stored # in column "__" - lhs = cudf.DataFrame({"__": arange(len(labels))}, index=labels) - rhs = cudf.DataFrame({"_": arange(len(obj))}, index=obj.index) + lhs = cudf.DataFrame( + {"__": cudf.core.column.arange(len(labels))}, index=labels + ) + rhs = cudf.DataFrame( + {"_": cudf.core.column.arange(len(obj))}, index=obj.index + ) return lhs.join(rhs).sort_values("__")["_"] @@ -256,8 +258,6 @@ def iloc(self): Selecting rows and column by position. - Examples - -------- >>> df = cudf.DataFrame({'a': range(20), ... 'b': range(20), ... 'c': range(20)}) @@ -334,7 +334,7 @@ def sort_index( Parameters ---------- - axis : {0 or ‘index’, 1 or ‘columns’}, default 0 + axis : {0 or 'index', 1 or 'columns'}, default 0 The axis along which to sort. The value 0 identifies the rows, and 1 identifies the columns. level : int or level name or list of ints or list of level names @@ -346,7 +346,7 @@ def sort_index( If True, perform operation in-place. kind : sorting method such as `quick sort` and others. Not yet supported. - na_position : {‘first’, ‘last’}, default ‘last’ + na_position : {'first', 'last'}, default 'last' Puts NaNs at the beginning if first; last puts NaNs at the end. sort_remaining : bool, default True Not yet supported @@ -858,7 +858,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): except Exception as e: raise RuntimeError("UDF kernel execution failed.") from e - col = as_column(ans_col) + col = cudf.core.column.as_column(ans_col) col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) result = cudf.Series._from_data({None: col}, self._index) @@ -1000,9 +1000,9 @@ def _align_to_index( # to recover ordering after index alignment. 
sort_col_id = str(uuid4()) if how == "left": - lhs[sort_col_id] = arange(len(lhs)) + lhs[sort_col_id] = cudf.core.column.arange(len(lhs)) elif how == "right": - rhs[sort_col_id] = arange(len(rhs)) + rhs[sort_col_id] = cudf.core.column.arange(len(rhs)) result = lhs.join(rhs, how=how, sort=sort) if how in ("left", "right"): @@ -1695,6 +1695,154 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) + # For more detail on this function and how it should work, see + # https://numpy.org/doc/stable/reference/ufuncs.html + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # We don't currently support reduction, accumulation, etc. We also + # don't support any special kwargs or higher arity ufuncs than binary. + if method != "__call__" or kwargs or ufunc.nin > 2: + return NotImplemented + + # Binary operations + binary_operations = { + # Arithmetic binary operations. + "add": "add", + "subtract": "sub", + "multiply": "mul", + "matmul": "matmul", + "divide": "truediv", + "true_divide": "truediv", + "floor_divide": "floordiv", + "power": "pow", + "float_power": "pow", + "remainder": "mod", + "mod": "mod", + "fmod": "mod", + # Bitwise binary operations. + "bitwise_and": "and", + "bitwise_or": "or", + "bitwise_xor": "xor", + # Comparison binary operators + "greater": "gt", + "greater_equal": "ge", + "less": "lt", + "less_equal": "le", + "not_equal": "ne", + "equal": "eq", + } + + # First look for methods of the class. + fname = ufunc.__name__ + if fname in binary_operations: + reflect = self is not inputs[0] + other = inputs[0] if reflect else inputs[1] + + # These operators need to be mapped to their inverses when + # performing a reflected operation because no reflected version of + # the operators themselves exist. + ops_without_reflection = { + "gt": "lt", + "ge": "le", + "lt": "gt", + "le": "ge", + # ne and eq are symmetric, so they are their own inverse op + "ne": "ne", + "eq": "eq", + } + + op = binary_operations[fname] + if reflect and op in ops_without_reflection: + op = ops_without_reflection[op] + reflect = False + op = f"__{'r' if reflect else ''}{op}__" + + # pandas bitwise operations return bools if indexes are misaligned. + if ( + "bitwise" in fname + and isinstance(other, IndexedFrame) + and not self.index.equals(other.index) + ): + return getattr(self, op)(other).astype(bool) + # Float_power returns float irrespective of the input type. + if fname == "float_power": + return getattr(self, op)(other).astype(float) + return getattr(self, op)(other) + + # Special handling for unary operations. + if fname == "negative": + return self * -1 + if fname == "positive": + return self.copy(deep=True) + if fname == "invert": + return ~self + if fname == "absolute": + return self.abs() + if fname == "fabs": + return self.abs().astype(np.float64) + + # Note: There are some operations that may be supported by libcudf but + # are not supported by pandas APIs. In particular, libcudf binary + # operations support logical and/or operations, but those operations + # are not defined on pd.Series/DataFrame. For now those operations will + # dispatch to cupy, but if ufuncs are ever a bottleneck we could add + # special handling to dispatch those (or any other) functions that we + # could implement without cupy. + + # Attempt to dispatch all other functions to cupy. + cupy_func = getattr(cp, fname) + if cupy_func: + # Indices must be aligned before converting to arrays. 
+ if ufunc.nin == 2: + other = inputs[self is inputs[0]] + inputs, index = self._prep_for_binop(other, fname) + else: + inputs = { + name: (col, None, False, None) + for name, col in self._data.items() + } + index = self._index + + mask = None + data = [{} for _ in range(ufunc.nout)] + for name, (left, right, _, _) in inputs.items(): + cupy_inputs = [] + # TODO: I'm jumping through multiple hoops to get the unary + # behavior to match up with the binary. I should see if there + # are better patterns to employ here. + for inp in (left, right) if ufunc.nin == 2 else (left,): + if ( + isinstance(inp, cudf.core.column.ColumnBase) + and inp.has_nulls() + ): + new_mask = cudf.core.column.as_column(inp.nullmask) + + # TODO: This is a hackish way to perform a bitwise and + # of bitmasks. Once we expose + # cudf::detail::bitwise_and, then we can use that + # instead. + mask = new_mask if mask is None else (mask & new_mask) + + # Arbitrarily fill with zeros. For ufuncs, we assume + # that the end result propagates nulls via a bitwise + # and, so these elements are irrelevant. + inp = inp.fillna(0) + cupy_inputs.append(cp.asarray(inp)) + + cp_output = cupy_func(*cupy_inputs, **kwargs) + if ufunc.nout == 1: + cp_output = (cp_output,) + for i, out in enumerate(cp_output): + data[i][name] = cudf.core.column.as_column(out).set_mask( + mask + ) + + out = tuple( + self.__class__._from_data(out, index=index) for out in data + ) + return out[0] if ufunc.nout == 1 else out + + return NotImplemented + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 39ff4718550..c7e46cf0165 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,7 +1,7 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations -from typing import TYPE_CHECKING, Callable, cast +from typing import TYPE_CHECKING, Any, Callable, List, cast import cudf from cudf import _lib as libcudf @@ -320,7 +320,7 @@ def _sort_result(self, result: Frame) -> Frame: # same order as given in 'on'. If the indices are used as # keys, the index will be sorted. If one index is specified, # the key columns on the other side will be used to sort. - by = [] + by: List[Any] = [] if self._using_left_index and self._using_right_index: if result._index is not None: by.extend(result._index._data.columns) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 8581b97c217..5e0cd2ca8cb 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
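In user terms, the `__array_ufunc__` implementation above keeps NumPy ufuncs on the GPU: binary arithmetic and comparison ufuncs map to the corresponding cudf operators, and everything else is dispatched to the matching CuPy kernel, with nulls filled for the kernel and masked back into the result. A hedged sketch of both paths:

```python
import numpy as np
import cudf

s = cudf.Series([0.0, None, 3.0])
print(np.add(s, 1))  # mapped to s.__add__(1); the null row stays null
print(np.sin(s))     # no cudf method in the table, so cp.sin runs on the
                     # GPU and the null row is masked back afterwards
```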
from __future__ import annotations @@ -6,6 +6,7 @@ import numbers import pickle from collections.abc import Sequence +from functools import cached_property from numbers import Integral from typing import Any, List, MutableMapping, Optional, Tuple, Union @@ -22,11 +23,7 @@ from cudf.core._compat import PANDAS_GE_120 from cudf.core.frame import Frame from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index -from cudf.utils.utils import ( - NotIterable, - _maybe_indices_to_slice, - cached_property, -) +from cudf.utils.utils import NotIterable, _maybe_indices_to_slice class MultiIndex(Frame, BaseIndex, NotIterable): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 2fd062152de..8574a152c44 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -958,14 +958,6 @@ def to_frame(self, name=None): def memory_usage(self, index=True, deep=False): return sum(super().memory_usage(index, deep).values()) - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if method == "__call__": - return get_appropriate_dispatched_func( - cudf, cudf.Series, cupy, ufunc, inputs, kwargs - ) - else: - return NotImplemented - def __array_function__(self, func, types, args, kwargs): handled_types = [cudf.Series] for t in types: @@ -1214,9 +1206,9 @@ def __repr__(self): lines.append(category_memory) return "\n".join(lines) - def _binaryop( + def _prep_for_binop( self, - other: Frame, + other: Any, fn: str, fill_value: Any = None, reflect: bool = False, @@ -1248,24 +1240,55 @@ def _binaryop( lhs = self operands = lhs._make_operands_for_binop(other, fill_value, reflect) + return operands, lhs._index + + def _binaryop( + self, + other: Frame, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._prep_for_binop( + other, fn, fill_value, reflect, can_reindex + ) return ( - lhs._from_data( - data=lhs._colwise_binop(operands, fn), index=lhs._index, + self._from_data( + data=self._colwise_binop(operands, fn), index=out_index, ) if operands is not NotImplemented else NotImplemented ) def logical_and(self, other): + warnings.warn( + "Series.logical_and is deprecated and will be removed.", + FutureWarning, + ) return self._binaryop(other, "l_and").astype(np.bool_) def remainder(self, other): + warnings.warn( + "Series.remainder is deprecated and will be removed.", + FutureWarning, + ) return self._binaryop(other, "mod") def logical_or(self, other): + warnings.warn( + "Series.logical_or is deprecated and will be removed.", + FutureWarning, + ) return self._binaryop(other, "l_or").astype(np.bool_) def logical_not(self): + warnings.warn( + "Series.logical_not is deprecated and will be removed.", + FutureWarning, + ) return self._unaryop("not") @copy_docstring(CategoricalAccessor) # type: ignore @@ -1782,10 +1805,7 @@ def astype(self, dtype, copy=False, errors="raise"): try: data = self._column.astype(dtype) - return self._from_data( - {self.name: (data.copy(deep=True) if copy else data)}, - index=self._index, - ) + return self._from_data({self.name: data}, index=self._index) except Exception as e: if errors == "raise": @@ -3289,14 +3309,16 @@ def merge( return result def add_prefix(self, prefix): - result = self.copy(deep=True) - result.index = prefix + self.index.astype(str) - return result + return Series._from_data( + data=self._data.copy(deep=True), + index=prefix + self.index.astype(str), + ) def add_suffix(self, suffix): - result = 
self.copy(deep=True) - result.index = self.index.astype(str) + suffix - return result + return Series._from_data( + data=self._data.copy(deep=True), + index=self.index.astype(str) + suffix, + ) def keys(self): """ @@ -3838,8 +3860,8 @@ def is_leap_year(self): Series Booleans indicating if dates belong to a leap year. - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-02-01', end='2013-02-01', freq='1Y')) @@ -3896,7 +3918,7 @@ def quarter(self): Integer indicating which quarter the date belongs to. Examples - ------- + -------- >>> import cudf >>> s = cudf.Series(["2020-05-31 08:00:00","1999-12-31 18:40:00"], ... dtype="datetime64[ms]") @@ -3972,8 +3994,8 @@ def days_in_month(self): Series Integers representing the number of days in month - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-08-01', end='2001-08-01', freq='1M')) @@ -4023,8 +4045,8 @@ def is_month_end(self): Series Booleans indicating if dates are the last day of the month. - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-08-26', end='2000-09-03', freq='1D')) @@ -4069,8 +4091,8 @@ def is_quarter_start(self): Series Booleans indicating if dates are the begining of a quarter - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-09-26', end='2000-10-03', freq='1D')) @@ -4115,8 +4137,8 @@ def is_quarter_end(self): Series Booleans indicating if dates are the end of a quarter - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series( ... pd.date_range(start='2000-09-26', end='2000-10-03', freq='1D')) @@ -4163,8 +4185,8 @@ def is_year_start(self): Series Booleans indicating if dates are the first day of the year. - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> s = cudf.Series(pd.date_range("2017-12-30", periods=3)) >>> dates @@ -4197,8 +4219,8 @@ def is_year_end(self): Series Booleans indicating if dates are the last day of the year. - Example - ------- + Examples + -------- >>> import pandas as pd, cudf >>> dates = cudf.Series(pd.date_range("2017-12-30", periods=3)) >>> dates diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index bf867923b57..50b206d3388 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
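Reviewer note: `logical_and`, `logical_or`, `logical_not`, and `remainder` now warn before their eventual removal. A sketch of how downstream code could surface these warnings early during migration; this is hypothetical caller code, not part of this diff, and the `&` operator on boolean Series is one plausible replacement for `logical_and`:

```python
import warnings

import cudf

s = cudf.Series([True, False, True])
t = cudf.Series([True, True, False])

# Promote the new FutureWarnings to errors to flush out deprecated calls.
with warnings.catch_warnings():
    warnings.simplefilter("error", FutureWarning)
    try:
        out = s.logical_and(t)
    except FutureWarning:
        out = (s & t).astype(bool)  # migrate to the operator spelling
```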
"""Base class for Frame types that only have a single column.""" from __future__ import annotations @@ -274,7 +274,7 @@ def factorize(self, na_sentinel=-1): def _make_operands_for_binop( self, - other: T, + other: Any, fill_value: Any = None, reflect: bool = False, *args, @@ -310,7 +310,7 @@ def _make_operands_for_binop( else: result_name = self.name - # This needs to be tested correctly + # TODO: This needs to be tested correctly if isinstance(other, SingleColumnFrame): other = other._column elif not _is_scalar_or_zero_d_array(other): diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 0f4256e49a6..8ffd75b1d76 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -302,8 +302,8 @@ def apply(self, func, *args, **kwargs): ----- See notes of the :meth:`cudf.Series.applymap` - Example - ------- + Examples + -------- >>> import cudf >>> def count_if_gt_3(window): diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index a09fb1f8e12..62260cbb822 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -20,8 +20,8 @@ def _make_empty_df(filepath_or_buffer, columns): orc_file = orc.ORCFile(filepath_or_buffer) schema = orc_file.schema col_names = schema.names if columns is None else columns - return cudf.DataFrame( - { + return cudf.DataFrame._from_data( + data={ col_name: cudf.core.column.column_empty( row_count=0, dtype=schema.field(col_name).type.to_pandas_dtype(), diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index a919b00692d..948428de4f0 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -7,6 +7,7 @@ from uuid import uuid4 import numpy as np +from nvtx import annotate from pyarrow import dataset as ds, parquet as pq import cudf @@ -16,6 +17,7 @@ from cudf.utils import ioutils +@annotate("_WRITE_PARQUET", color="green", domain="cudf_python") def _write_parquet( df, paths, @@ -73,6 +75,7 @@ def _write_parquet( # Logic chosen to match: https://arrow.apache.org/ # docs/_modules/pyarrow/parquet.html#write_to_dataset +@annotate("WRITE_TO_DATASET", color="green", domain="cudf_python") def write_to_dataset( df, root_path, @@ -161,6 +164,7 @@ def write_to_dataset( @ioutils.doc_read_parquet_metadata() +@annotate("READ_PARQUET_METADATA", color="green", domain="cudf_python") def read_parquet_metadata(path): """{docstring}""" @@ -173,6 +177,7 @@ def read_parquet_metadata(path): return num_rows, num_row_groups, col_names +@annotate("_PROCESS_DATASET", color="green", domain="cudf_python") def _process_dataset( paths, fs, filters=None, row_groups=None, categorical_partitions=True, ): @@ -308,6 +313,7 @@ def _process_dataset( @ioutils.doc_read_parquet() +@annotate("READ_PARQUET", color="green", domain="cudf_python") def read_parquet( filepath_or_buffer, engine="cudf", @@ -435,6 +441,7 @@ def read_parquet( ) +@annotate("_PARQUET_TO_FRAME", color="green", domain="cudf_python") def _parquet_to_frame( paths_or_buffers, *args, @@ -502,6 +509,7 @@ def _parquet_to_frame( ) +@annotate("_WRITE_PARQUET", color="green", domain="cudf_python") def _read_parquet( filepaths_or_buffers, engine, @@ -535,6 +543,7 @@ def _read_parquet( @ioutils.doc_to_parquet() +@annotate("TO_PARQUET", color="green", domain="cudf_python") def to_parquet( df, path, @@ -646,6 +655,7 @@ def _generate_filename(): return uuid4().hex + ".parquet" +@annotate("_GET_PARTITIONED", color="green", domain="cudf_python") def _get_partitioned( df, root_path, @@ 
-689,6 +699,7 @@ def _get_partitioned(


 class ParquetDatasetWriter:
+    @annotate("ParquetDatasetWriter_INIT", color="green", domain="cudf_python")
     def __init__(
         self,
         path,
@@ -765,6 +776,9 @@ def __init__(
         self.path_cw_map: Dict[str, int] = {}
         self.filename = None

+    @annotate(
+        "ParquetDatasetWriter_WRITE_TABLE", color="green", domain="cudf_python"
+    )
     def write_table(self, df):
         """
         Write a dataframe to the file/dataset
@@ -821,6 +835,9 @@ def write_table(self, df):
         self.path_cw_map.update({k: new_cw_idx for k in new_paths})
         self._chunked_writers[-1][0].write_table(grouped_df, part_info)

+    @annotate(
+        "ParquetDatasetWriter_CLOSE", color="green", domain="cudf_python"
+    )
     def close(self, return_metadata=False):
         """
         Close all open files and optionally return footer metadata as a binary
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index 6c602d321eb..e767c0c62be 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -1,5 +1,6 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.

+import itertools
 import re
 import warnings
 from collections.abc import Mapping, Sequence
@@ -46,7 +47,7 @@ def set_random_null_mask_inplace(series, null_probability=0.5, seed=None):
     probs = [null_probability, 1 - null_probability]
     rng = np.random.default_rng(seed=seed)
     mask = rng.choice([False, True], size=len(series), p=probs)
-    series[mask] = None
+    series.iloc[mask] = None


 # TODO: This function should be removed. Anywhere that it is being used should
@@ -330,3 +331,9 @@ def does_not_raise():

 def xfail_param(param, **kwargs):
     return pytest.param(param, marks=pytest.mark.xfail(**kwargs))
+
+
+parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
+    "left_dtype,right_dtype",
+    list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
+)
diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py
index 041bd055f0a..4d5b5926d6e 100644
--- a/python/cudf/cudf/tests/conftest.py
+++ b/python/cudf/cudf/tests/conftest.py
@@ -1,10 +1,41 @@
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+
+import os
 import pathlib

 import pytest

 import rmm  # noqa: F401

+_CURRENT_DIRECTORY = str(pathlib.Path(__file__).resolve().parent)
+

 @pytest.fixture(scope="session")
 def datadir():
     return pathlib.Path(__file__).parent / "data"
+
+
+# To set and remove the NO_EXTERNAL_ONLY_APIS environment variable we must use
+# the sessionstart and sessionfinish hooks rather than a simple autouse,
+# session-scoped fixture because these variables must be set before collection
+# occurs: the environment variable is checked as soon as cudf is imported
+# anywhere.
+def pytest_sessionstart(session):
+    """
+    Called after the Session object has been created and
+    before performing collection and entering the run test loop.
+    """
+    os.environ["NO_EXTERNAL_ONLY_APIS"] = "1"
+    os.environ["_CUDF_TEST_ROOT"] = _CURRENT_DIRECTORY
+
+
+def pytest_sessionfinish(session, exitstatus):
+    """
+    Called after whole test run finished, right before
+    returning the exit status to the system.
+    """
+    try:
+        del os.environ["NO_EXTERNAL_ONLY_APIS"]
+        del os.environ["_CUDF_TEST_ROOT"]
+    except KeyError:
+        pass
diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py
index 3fe0321ec54..e4b4d5020ea 100644
--- a/python/cudf/cudf/tests/test_array_ufunc.py
+++ b/python/cudf/cudf/tests/test_array_ufunc.py
@@ -1,175 +1,208 @@
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
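Reviewer note: the conftest change above works because `pytest_sessionstart` runs before collection, whereas even an autouse session-scoped fixture only runs after test modules (and therefore cudf) have been imported. A stripped-down sketch of the pattern with a hypothetical flag name:

```python
# conftest.py (sketch)
import os

_FLAG = "MYLIB_STRICT_MODE"  # hypothetical environment variable


def pytest_sessionstart(session):
    # Runs before collection, so the flag is visible before any test
    # module (and hence the library under test) is imported.
    os.environ[_FLAG] = "1"


def pytest_sessionfinish(session, exitstatus):
    # Avoid leaking the setting into anything run after this session.
    os.environ.pop(_FLAG, None)
```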
+ +import operator +import warnings +from contextlib import contextmanager +from functools import reduce + import cupy as cp import numpy as np -import pandas as pd import pytest import cudf -from cudf.testing._utils import assert_eq - - -@pytest.fixture -def np_ar_tup(): - np.random.seed(0) - return (np.random.random(100), np.random.random(100)) - +from cudf.testing._utils import assert_eq, set_random_null_mask_inplace -comparison_ops_ls = [ - np.greater, - np.greater_equal, - np.less, - np.less_equal, - np.equal, - np.not_equal, +_UFUNCS = [ + obj + for obj in (getattr(np, name) for name in dir(np)) + if isinstance(obj, np.ufunc) ] -@pytest.mark.parametrize( - "func", comparison_ops_ls + [np.subtract, np.fmod, np.power] -) -def test_ufunc_cudf_non_nullseries(np_ar_tup, func): - x, y = np_ar_tup[0], np_ar_tup[1] - s_1, s_2 = cudf.Series(x), cudf.Series(y) - expect = func(x, y) - got = func(s_1, s_2) - assert_eq(expect, got.to_numpy()) - - -@pytest.mark.parametrize( - "func", [np.bitwise_and, np.bitwise_or, np.bitwise_xor], -) -def test_ufunc_cudf_series_bitwise(func): - np.random.seed(0) - x = np.random.randint(size=100, low=0, high=100) - y = np.random.randint(size=100, low=0, high=100) - - s_1, s_2 = cudf.Series(x), cudf.Series(y) - expect = func(x, y) - got = func(s_1, s_2) - assert_eq(expect, got.to_numpy()) - - -@pytest.mark.parametrize( - "func", - [ - np.subtract, - np.multiply, - np.floor_divide, - np.true_divide, - np.power, - np.remainder, - np.divide, - ], -) -def test_ufunc_cudf_null_series(np_ar_tup, func): - x, y = np_ar_tup[0].astype(np.float32), np_ar_tup[1].astype(np.float32) - x[0] = np.nan - y[1] = np.nan - s_1, s_2 = cudf.Series(x), cudf.Series(y) - expect = func(x, y) - got = func(s_1, s_2) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - scalar = 0.5 - expect = func(x, scalar) - got = func(s_1, scalar) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - expect = func(scalar, x) - got = func(scalar, s_1) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - -@pytest.mark.xfail( - reason="""cuDF comparison operations with incorrectly - returns False rather than """ -) -@pytest.mark.parametrize( - "func", comparison_ops_ls, -) -def test_ufunc_cudf_null_series_comparison_ops(np_ar_tup, func): - x, y = np_ar_tup[0].astype(np.float32), np_ar_tup[1].astype(np.float32) - x[0] = np.nan - y[1] = np.nan - s_1, s_2 = cudf.Series(x), cudf.Series(y) - expect = func(x, y) - got = func(s_1, s_2) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - scalar = 0.5 - expect = func(x, scalar) - got = func(s_1, scalar) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - expect = func(scalar, x) - got = func(scalar, s_1) - assert_eq(expect, got.fillna(np.nan).to_numpy()) - - -@pytest.mark.parametrize( - "func", [np.logaddexp, np.fmax, np.fmod], -) -def test_ufunc_cudf_series_cupy_array(np_ar_tup, func): - x, y = np_ar_tup[0], np_ar_tup[1] - expect = func(x, y) - - cudf_s = cudf.Series(x) - cupy_ar = cp.array(y) - got = func(cudf_s, cupy_ar) - assert_eq(expect, got.to_numpy()) - - -@pytest.mark.parametrize( - "func", - [np.fmod, np.logaddexp, np.bitwise_and, np.bitwise_or, np.bitwise_xor], -) -def test_error_with_null_cudf_series(func): - s_1 = cudf.Series([1, 2]) - s_2 = cudf.Series([1, None]) - - # this thows a value error - # because of nulls in cudf.Series - with pytest.raises(ValueError): - func(s_1, s_2) - - s_1 = cudf.Series([1, 2]) - s_2 = cudf.Series([1, 2, None]) - - # this throws a value-error if indexes are not aligned - # following pandas behavior for ufunc numpy 
dispatching - with pytest.raises( - ValueError, match="Can only compare identically-labeled Series objects" +@contextmanager +def _hide_ufunc_warnings(ufunc): + # pandas raises warnings for some inputs to the following ufuncs: + name = ufunc.__name__ + if name in { + "arccos", + "arccosh", + "arcsin", + "arctanh", + "fmod", + "log", + "log10", + "log2", + "reciprocal", + }: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + f"invalid value encountered in {name}", + category=RuntimeWarning, + ) + warnings.filterwarnings( + "ignore", + f"divide by zero encountered in {name}", + category=RuntimeWarning, + ) + yield + else: + yield + + +@pytest.mark.parametrize("ufunc", _UFUNCS) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +def test_ufunc_series(ufunc, has_nulls, indexed): + # Note: This test assumes that all ufuncs are unary or binary. + fname = ufunc.__name__ + if indexed and fname in ( + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + "equal", ): - func(s_1, s_2) + pytest.skip("Comparison operators do not support misaligned indexes.") + + if (indexed or has_nulls) and fname == "matmul": + pytest.xfail("Frame.dot currently does not support indexes or nulls") + + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + pandas_args = args = [ + cudf.Series( + cp.random.randint(low=1, high=10, size=N), + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg) + pandas_args = [arg.fillna(0) for arg in args] + + # Note: Different indexes must be aligned before the mask is computed. + # This requires using an internal function (_align_indices), and that + # is unlikely to change for the foreseeable future. + aligned = ( + cudf.core.series._align_indices(args, allow_non_unique=True) + if indexed and ufunc.nin == 2 + else args + ) + mask = reduce(operator.or_, (a.isna() for a in aligned)).to_pandas() + + try: + got = ufunc(*args) + except AttributeError as e: + # We xfail if we don't have an explicit dispatch and cupy doesn't have + # the method so that we can easily identify these methods. As of this + # writing, the only missing methods are isnat and heaviside. 
+ if "module 'cupy' has no attribute" in str(e): + pytest.xfail(reason="Operation not supported by cupy") + raise + + with _hide_ufunc_warnings(ufunc): + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + + try: + if ufunc.nout > 1: + for g, e in zip(got, expect): + if has_nulls: + e[mask] = np.nan + assert_eq(g, e) + else: + if has_nulls: + expect[mask] = np.nan + assert_eq(got, expect) + except AssertionError: + # TODO: This branch can be removed when + # https://github.com/rapidsai/cudf/issues/10178 is resolved + if fname in ("power", "float_power"): + not_equal = cudf.from_pandas(expect) != got + not_equal[got.isna()] = False + diffs = got[not_equal] - expect[not_equal.to_pandas()] + if diffs.abs().max() == 1: + pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") + raise @pytest.mark.parametrize( - "func", [np.absolute, np.sign, np.exp2, np.tanh], + "ufunc", [np.add, np.greater, np.greater_equal, np.logical_and] ) -def test_ufunc_cudf_series_with_index(func): - data = [-1, 2, 3, 0] - index = [2, 3, 1, 0] - cudf_s = cudf.Series(data=data, index=index) - pd_s = pd.Series(data=data, index=index) - - expect = func(pd_s) - got = func(cudf_s) - - assert_eq(got, expect) - - -@pytest.mark.parametrize( - "func", [np.logaddexp2], -) -def test_ufunc_cudf_series_with_nonaligned_index(func): - cudf_s1 = cudf.Series(data=[-1, 2, 3, 0], index=[2, 3, 1, 0]) - cudf_s2 = cudf.Series(data=[-1, 2, 3, 0], index=[3, 1, 0, 2]) - - # this throws a value-error if indexes are not aligned - # following pandas behavior for ufunc numpy dispatching - with pytest.raises( - ValueError, match="Can only compare identically-labeled Series objects" - ): - func(cudf_s1, cudf_s2) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +@pytest.mark.parametrize("type_", ["cupy", "numpy", "list"]) +@pytest.mark.parametrize("reflect", [True, False]) +def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_, reflect): + fname = ufunc.__name__ + if fname in ("greater", "greater_equal", "logical_and") and has_nulls: + pytest.xfail( + "The way cudf casts nans in arrays to nulls during binops with " + "cudf objects is currently incompatible with pandas." + ) + if reflect and has_nulls and type_ == "cupy": + pytest.skip( + "When cupy is the left operand there is no way for us to avoid " + "calling its binary operators, which cannot handle cudf objects " + "that contain nulls." + ) + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + args = [ + cudf.Series( + cp.random.rand(N), + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg) + + # Cupy doesn't support nulls, so we fill with nans before converting. 
+ args[1] = args[1].fillna(cp.nan) + mask = args[0].isna().to_pandas() + + arg1 = args[1].to_cupy() if type_ == "cupy" else args[1].to_numpy() + if type_ == "list": + arg1 = arg1.tolist() + + if reflect: + got = ufunc(arg1, args[0]) + expect = ufunc(args[1].to_numpy(), args[0].to_pandas()) + else: + got = ufunc(args[0], arg1) + expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) + + if ufunc.nout > 1: + for g, e in zip(got, expect): + if has_nulls: + e[mask] = np.nan + if type_ == "cupy" and reflect: + assert (cp.asnumpy(g) == e).all() + else: + assert_eq(g, e) + else: + if has_nulls: + expect[mask] = np.nan + if type_ == "cupy" and reflect: + assert (cp.asnumpy(got) == expect).all() + else: + assert_eq(got, expect) @pytest.mark.parametrize( @@ -182,3 +215,103 @@ def test_ufunc_cudf_series_error_with_out_kwarg(func): # this throws a value-error because of presence of out kwarg with pytest.raises(TypeError): func(x1=cudf_s1, x2=cudf_s2, out=cudf_s3) + + +# Skip matmul since it requires aligned shapes. +@pytest.mark.parametrize("ufunc", (uf for uf in _UFUNCS if uf != np.matmul)) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +def test_ufunc_dataframe(ufunc, has_nulls, indexed): + # Note: This test assumes that all ufuncs are unary or binary. + fname = ufunc.__name__ + # TODO: When pandas starts supporting misaligned indexes properly, remove + # this check but enable the one below. + if indexed: + pytest.xfail( + "pandas does not currently support misaligned indexes in " + "DataFrames, but we do. Until this is fixed we will skip these " + "tests. See the error here: " + "https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L212, " # noqa: E501 + "called from https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L258" # noqa: E501 + ) + # TODO: Enable the check below when we remove the check above. + # if indexed and fname in ( + # "greater", + # "greater_equal", + # "less", + # "less_equal", + # "not_equal", + # "equal", + # ): + # pytest.skip("Comparison operators do not support misaligned indexes.") # noqa: E501 + + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + # TODO: Add tests of mismatched columns etc. + pandas_args = args = [ + cudf.DataFrame( + {"foo": cp.random.randint(low=1, high=10, size=N)}, + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg["foo"]) + pandas_args = [arg.copy() for arg in args] + for arg in pandas_args: + arg["foo"] = arg["foo"].fillna(0) + + # Note: Different indexes must be aligned before the mask is computed. + # This requires using an internal function (_align_indices), and that + # is unlikely to change for the foreseeable future. 
+ aligned = ( + cudf.core.dataframe._align_indices(*args) + if indexed and ufunc.nin == 2 + else args + ) + mask = reduce( + operator.or_, (a["foo"].isna() for a in aligned) + ).to_pandas() + + try: + got = ufunc(*args) + except AttributeError as e: + # We xfail if we don't have an explicit dispatch and cupy doesn't have + # the method so that we can easily identify these methods. As of this + # writing, the only missing methods are isnat and heaviside. + if "module 'cupy' has no attribute" in str(e): + pytest.xfail(reason="Operation not supported by cupy") + raise + + with _hide_ufunc_warnings(ufunc): + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + + try: + if ufunc.nout > 1: + for g, e in zip(got, expect): + if has_nulls: + e[mask] = np.nan + assert_eq(g, e) + else: + if has_nulls: + expect[mask] = np.nan + assert_eq(got, expect) + except AssertionError: + # TODO: This branch can be removed when + # https://github.com/rapidsai/cudf/issues/10178 is resolved + if fname in ("power", "float_power"): + not_equal = cudf.from_pandas(expect) != got + not_equal[got.isna()] = False + diffs = got[not_equal] - cudf.from_pandas( + expect[not_equal.to_pandas()] + ) + if diffs["foo"].abs().max() == 1: + pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") + raise diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index a064bec9e82..9eb01ae31b4 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
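Reviewer note: in the ufunc tests above, the expected null positions are rebuilt by OR-reducing each operand's null indicator after index alignment. The idea in isolation, with plain pandas and illustrative data:

```python
import operator
from functools import reduce

import pandas as pd

operands = [pd.Series([1.0, None, 3.0]), pd.Series([None, 2.0, 3.0])]

# A result element is null if *any* operand is null there; the tests apply
# this mask to the pandas result after computing on null-filled data.
mask = reduce(operator.or_, (s.isna() for s in operands))
print(mask.tolist())  # [True, True, False]
```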
@@ -210,7 +210,7 @@ def test_can_parse_no_schema(): assert_eq(expected, actual) -@pytest.mark.parametrize("rows", [0, 1, 10, 100000]) +@pytest.mark.parametrize("rows", [0, 1, 10, 1000]) @pytest.mark.parametrize("codec", ["null", "deflate", "snappy"]) def test_avro_compression(rows, codec): schema = { diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 76add8b9c5d..02ca7a0cd58 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -4,7 +4,7 @@ import decimal import operator import random -from itertools import product +from itertools import combinations_with_replacement, product import cupy as cp import numpy as np @@ -216,13 +216,12 @@ def test_series_compare(cmpop, obj_class, dtype): def _series_compare_nulls_typegen(): - tests = [] - tests += list(product(DATETIME_TYPES, DATETIME_TYPES)) - tests += list(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES)) - tests += list(product(NUMERIC_TYPES, NUMERIC_TYPES)) - tests += list(product(STRING_TYPES, STRING_TYPES)) - - return tests + return [ + *combinations_with_replacement(DATETIME_TYPES, 2), + *combinations_with_replacement(TIMEDELTA_TYPES, 2), + *combinations_with_replacement(NUMERIC_TYPES, 2), + *combinations_with_replacement(STRING_TYPES, 2), + ] @pytest.mark.parametrize("cmpop", _cmpops) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index b8724fe36f5..1ab5931fe5f 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -574,7 +574,7 @@ def test_concat_empty_dataframes(df, other, ignore_index): expected = pd.concat(other_pd, ignore_index=ignore_index) actual = gd.concat(other_gd, ignore_index=ignore_index) if expected.shape != df.shape: - for key, col in actual[actual.columns].iteritems(): + for key, col in actual[actual.columns].items(): if is_categorical_dtype(col.dtype): if not is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: @@ -1184,7 +1184,7 @@ def test_concat_join_empty_dataframes( ) if expected.shape != df.shape: if axis == 0: - for key, col in actual[actual.columns].iteritems(): + for key, col in actual[actual.columns].items(): if is_categorical_dtype(col.dtype): if not is_categorical_dtype(expected[key].dtype): # TODO: Pandas bug: @@ -1306,7 +1306,7 @@ def test_concat_join_empty_dataframes_axis_1( ) if expected.shape != df.shape: if axis == 0: - for key, col in actual[actual.columns].iteritems(): + for key, col in actual[actual.columns].items(): if is_categorical_dtype(col.dtype): expected[key] = expected[key].fillna("-1") actual[key] = col.astype("str").fillna("-1") diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 9208b8c7cd4..f3d69e1745e 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
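Reviewer note: swapping `product` for `combinations_with_replacement` in the dtype-pair generators above trims the mirrored (b, a) duplicates while keeping the (a, a) diagonal. The count difference, runnable anywhere:

```python
from itertools import combinations_with_replacement, product

types = ["int8", "int16", "float32"]

# Ordered pairs: 3 * 3 = 9, including both ("int8", "int16") and its mirror.
assert len(list(product(types, repeat=2))) == 9

# Unordered pairs with repeats: 3 * 4 / 2 = 6.
assert len(list(combinations_with_replacement(types, 2))) == 6
```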
import gzip import os @@ -8,6 +8,7 @@ from io import BytesIO, StringIO from pathlib import Path +import cupy as cp import numpy as np import pandas as pd import pytest @@ -1009,17 +1010,17 @@ def test_small_zip(tmpdir): def test_csv_reader_carriage_return(tmpdir): rows = 1000 names = ["int_row", "int_double_row"] - buffer = ",".join(names) + "\r\n" for row in range(rows): buffer += str(row) + ", " + str(2 * row) + "\r\n" df = read_csv(StringIO(buffer)) + expect = cudf.DataFrame( + {"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2} + ) assert len(df) == rows - for row in range(0, rows): - assert df[names[0]][row] == row - assert df[names[1]][row] == 2 * row + assert_eq(expect, df) def test_csv_reader_tabs(): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c7dfdd9d9f9..acd9e28c661 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -917,9 +917,7 @@ def test_dataframe_dtypes(): dtypes = pd.Series( [np.int32, np.float32, np.float64], index=["c", "a", "b"] ) - df = cudf.DataFrame( - {k: np.ones(10, dtype=v) for k, v in dtypes.iteritems()} - ) + df = cudf.DataFrame({k: np.ones(10, dtype=v) for k, v in dtypes.items()}) assert df.dtypes.equals(dtypes) @@ -1956,7 +1954,7 @@ def test_dataframe_reductions(data, axis, func, skipna): elif func not in cudf.core.dataframe._cupy_nan_methods_map: if skipna is False: expected_exception = NotImplementedError - elif any(col.nullable for name, col in gdf.iteritems()): + elif any(col.nullable for name, col in gdf.items()): expected_exception = ValueError elif func in ("cummin", "cummax"): expected_exception = AttributeError @@ -2134,7 +2132,7 @@ def test_iter(pdf, gdf): def test_iteritems(gdf): - for k, v in gdf.iteritems(): + for k, v in gdf.items(): assert k in gdf.columns assert isinstance(v, cudf.Series) assert_eq(v, gdf[k]) @@ -3444,29 +3442,37 @@ def test_get_numeric_data(): @pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20]) +@pytest.mark.parametrize("period", [-15, -1, 0, 1, 15]) @pytest.mark.parametrize("data_empty", [False, True]) def test_shift(dtype, period, data_empty): - + # TODO : this function currently tests for series.shift() + # but should instead test for dataframe.shift() if data_empty: data = None else: if dtype == np.int8: # to keep data in range - data = gen_rand(dtype, 100000, low=-2, high=2) + data = gen_rand(dtype, 10, low=-2, high=2) else: - data = gen_rand(dtype, 100000) + data = gen_rand(dtype, 10) - gdf = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) - pdf = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) + gs = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) + ps = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) - shifted_outcome = gdf.a.shift(period).fillna(0) - expected_outcome = pdf.a.shift(period).fillna(0).astype(dtype) + shifted_outcome = gs.a.shift(period) + expected_outcome = ps.a.shift(period) + # pandas uses NaNs to signal missing value and force converts the + # results columns to float types if data_empty: - assert_eq(shifted_outcome, expected_outcome, check_index_type=False) + assert_eq( + shifted_outcome, + expected_outcome, + check_index_type=False, + check_dtype=False, + ) else: - assert_eq(shifted_outcome, expected_outcome) + assert_eq(shifted_outcome, expected_outcome, check_dtype=False) @pytest.mark.parametrize("dtype", NUMERIC_TYPES) @@ -8811,6 +8817,124 @@ def test_dataframe_add_suffix(): 
assert_eq(got, expected) +@pytest.mark.parametrize( + "data, gkey", + [ + ( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + }, + ["id"], + ), + ( + { + "id": [0, 0, 0, 0, 1, 1, 1], + "a": [10.0, 3, 4, 2.0, -3.0, 9.0, 10.0], + "b": [10.0, 23, -4.0, 2, -3.0, 9, 19.0], + }, + ["id", "a"], + ), + ], +) +@pytest.mark.parametrize( + "min_periods", [0, 3], +) +@pytest.mark.parametrize( + "ddof", [1, 2], +) +def test_groupby_covariance(data, gkey, min_periods, ddof): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.groupby(gkey).cov(min_periods=min_periods, ddof=ddof) + expected = pdf.groupby(gkey).cov(min_periods=min_periods, ddof=ddof) + + assert_eq(expected, actual) + + +def test_groupby_covariance_multiindex_dataframe(): + gdf = cudf.DataFrame( + { + "a": [1, 1, 2, 2], + "b": [1, 1, 2, 3], + "c": [2, 3, 4, 5], + "d": [6, 8, 9, 1], + } + ).set_index(["a", "b"]) + + actual = gdf.groupby(level=["a", "b"]).cov() + expected = gdf.to_pandas().groupby(level=["a", "b"]).cov() + + assert_eq(expected, actual) + + +def test_groupby_covariance_empty_columns(): + gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) + pdf = gdf.to_pandas() + + actual = gdf.groupby("id").cov() + expected = pdf.groupby("id").cov() + + assert_eq( + expected, actual, check_dtype=False, check_index_type=False, + ) + + +def test_groupby_cov_invalid_column_types(): + gdf = cudf.DataFrame( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + }, + ) + with pytest.raises( + TypeError, match="Covariance accepts only numerical column-pairs", + ): + gdf.groupby("id").cov() + + +def test_groupby_cov_positive_semidefinite_matrix(): + # Refer to discussions in PR #9889 re "pair-wise deletion" strategy + # being used in pandas to compute the covariance of a dataframe with + # rows containing missing values. + # Note: cuDF currently matches pandas behavior in that the covariance + # matrices are not guaranteed PSD (positive semi definite). + # https://github.com/rapidsai/cudf/pull/9889#discussion_r794158358 + gdf = cudf.DataFrame( + [[1, 2], [None, 4], [5, None], [7, 8]], columns=["v0", "v1"] + ) + actual = gdf.groupby(by=cudf.Series([1, 1, 1, 1])).cov() + actual.reset_index(drop=True, inplace=True) + + pdf = gdf.to_pandas() + expected = pdf.groupby(by=pd.Series([1, 1, 1, 1])).cov() + expected.reset_index(drop=True, inplace=True) + + assert_eq( + expected, actual, check_dtype=False, + ) + + +@pytest.mark.xfail +def test_groupby_cov_for_pandas_bug_case(): + # Handles case: pandas bug using ddof with missing data. 
+ # Filed an issue in Pandas on GH, link below: + # https://github.com/pandas-dev/pandas/issues/45814 + pdf = pd.DataFrame( + {"id": ["a", "a"], "val1": [1.0, 2.0], "val2": [np.nan, np.nan]} + ) + expected = pdf.groupby("id").cov(ddof=2) + + gdf = cudf.from_pandas(pdf) + actual = gdf.groupby("id").cov(ddof=2) + + assert_eq(expected, actual) + + @pytest.mark.parametrize( "data", [ @@ -8925,3 +9049,24 @@ def test_dataframe_rename_duplicate_column(): ValueError, match="Duplicate column names are not allowed" ): gdf.rename(columns={"a": "b"}, inplace=True) + + +@pytest.mark.parametrize( + "data", + [ + np.random.RandomState(seed=10).randint(-50, 50, (10, 10)), + np.random.RandomState(seed=10).random_sample((4, 4)), + np.array([1.123, 2.343, 5.890, 0.0]), + {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]}, + ], +) +@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5]) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"]) +def test_dataframe_pct_change(data, periods, fill_method): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.pct_change(periods=periods, fill_method=fill_method) + expected = pdf.pct_change(periods=periods, fill_method=fill_method) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py index 47c9448cf63..692f40873d7 100644 --- a/python/cudf/cudf/tests/test_extension_compilation.py +++ b/python/cudf/cudf/tests/test_extension_compilation.py @@ -1,13 +1,17 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import operator import cupy as cp +import numpy as np import pytest from numba import cuda, types from numba.cuda import compile_ptx +from numba.np.numpy_support import from_dtype from cudf import NA from cudf.core.udf.api import Masked from cudf.core.udf.typing import MaskedType +from cudf.testing._utils import parametrize_numeric_dtypes_pairwise arith_ops = ( operator.add, @@ -159,19 +163,21 @@ def func(x): @pytest.mark.parametrize("op", ops) -@pytest.mark.parametrize("ty1", number_types, ids=number_ids) -@pytest.mark.parametrize("ty2", number_types, ids=number_ids) +@parametrize_numeric_dtypes_pairwise @pytest.mark.parametrize( "masked", ((False, True), (True, False), (True, True)), ids=("um", "mu", "mm"), ) -def test_compile_arith_masked_ops(op, ty1, ty2, masked): +def test_compile_arith_masked_ops(op, left_dtype, right_dtype, masked): def func(x, y): return op(x, y) cc = (7, 5) + ty1 = from_dtype(np.dtype(left_dtype)) + ty2 = from_dtype(np.dtype(right_dtype)) + if masked[0]: ty1 = MaskedType(ty1) if masked[1]: diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 102e5b57e8e..19d7c8a10ab 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
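Reviewer note: the new `pct_change` test above defers to pandas for expected values. For reference, the semantics being exercised, shown with pandas alone (runnable without a GPU):

```python
import pandas as pd

s = pd.Series([100.0, None, 110.0, 121.0])

# fill_method="ffill" fills gaps with the prior value before computing
# x[t] / x[t - periods] - 1.
print(s.pct_change(periods=1, fill_method="ffill").tolist())
# [nan, 0.0, ~0.1, ~0.1]
```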
from itertools import combinations @@ -1292,45 +1292,43 @@ def test_loc_datetime_index(sli, is_dataframe): @pytest.mark.parametrize( - "gdf", + "gdf_kwargs", [ - cudf.DataFrame({"a": range(1000000)}), - cudf.DataFrame({"a": range(1000000), "b": range(1000000)}), - cudf.DataFrame({"a": range(20), "b": range(20)}), - cudf.DataFrame( - { + {"data": {"a": range(100000)}}, + {"data": {"a": range(100000), "b": range(100000)}}, + { + "data": { "a": range(20), "b": range(20), "c": ["abc", "def", "xyz", "def", "pqr"] * 4, } - ), - cudf.DataFrame(index=[1, 2, 3]), - cudf.DataFrame(index=range(1000000)), - cudf.DataFrame(columns=["a", "b", "c", "d"]), - cudf.DataFrame(columns=["a"], index=range(1000000)), - cudf.DataFrame( - columns=["a", "col2", "...col n"], index=range(1000000) - ), - cudf.DataFrame(index=cudf.Series(range(1000000)).astype("str")), - cudf.DataFrame( - columns=["a", "b", "c", "d"], - index=cudf.Series(range(1000000)).astype("str"), - ), + }, + {"index": [1, 2, 3]}, + {"index": range(100000)}, + {"columns": ["a", "b", "c", "d"]}, + {"columns": ["a"], "index": range(100000)}, + {"columns": ["a", "col2", "...col n"], "index": range(100000)}, + {"index": cudf.Series(range(100000)).astype("str")}, + { + "columns": ["a", "b", "c", "d"], + "index": cudf.Series(range(100000)).astype("str"), + }, ], ) @pytest.mark.parametrize( "slice", [ - slice(250000, 500000), - slice(250000, 250001), - slice(500000), + slice(25000, 50000), + slice(25000, 25001), + slice(50000), slice(1, 10), slice(10, 20), slice(15, 24000), slice(6), ], ) -def test_dataframe_sliced(gdf, slice): +def test_dataframe_sliced(gdf_kwargs, slice): + gdf = cudf.DataFrame(**gdf_kwargs) pdf = gdf.to_pandas() actual = gdf[slice] diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 8689f773a02..623098741a9 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -16,6 +16,7 @@ import cudf from cudf.io.orc import ORCWriter +from cudf.testing import assert_frame_equal from cudf.testing._utils import ( assert_eq, gen_rand_series, @@ -93,7 +94,7 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine): path, engine=engine, columns=columns, use_index=use_index ) - assert_eq(expect, got, check_categorical=False) + assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False) def test_orc_reader_filenotfound(tmpdir): @@ -384,11 +385,13 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression): else: print(type(excpr).__name__) - expect = orcfile.read(columns=columns).to_pandas() - cudf.from_pandas(expect).to_orc(gdf_fname.strpath, compression=compression) - got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() + expect = cudf.from_pandas(orcfile.read(columns=columns).to_pandas()) + expect.to_orc(gdf_fname.strpath, compression=compression) + got = cudf.from_pandas( + pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() + ) - assert_eq(expect, got) + assert_frame_equal(expect, got) @pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) @@ -405,11 +408,11 @@ def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq): else: print(type(excpr).__name__) - expect = orcfile.read().to_pandas() - cudf.from_pandas(expect).to_orc(gdf_fname.strpath, statistics=stats_freq) - got = pa.orc.ORCFile(gdf_fname).read().to_pandas() + expect = cudf.from_pandas(orcfile.read().to_pandas()) + expect.to_orc(gdf_fname.strpath, statistics=stats_freq) + got = 
cudf.from_pandas(pa.orc.ORCFile(gdf_fname).read().to_pandas()) - assert_eq(expect, got) + assert_frame_equal(expect, got) @pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"]) @@ -492,8 +495,7 @@ def test_chunked_orc_writer( writer.close() got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas() - - assert_eq(expect, got) + assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got)) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index e1ca9f6f006..7feaa400446 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1105,9 +1105,9 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir): assert_eq(expect, got) -@pytest.mark.parametrize("skip", range(0, 128)) +@pytest.mark.parametrize("skip", [0, 1, 5, 10]) def test_parquet_reader_list_skiprows(skip, tmpdir): - num_rows = 128 + num_rows = 10 src = pd.DataFrame( { "a": list_gen(int_gen, 0, num_rows, 80, 50), @@ -1124,9 +1124,9 @@ def test_parquet_reader_list_skiprows(skip, tmpdir): assert_eq(expect, got, check_dtype=False) -@pytest.mark.parametrize("skip", range(0, 120)) +@pytest.mark.parametrize("skip", [0, 1, 5, 10]) def test_parquet_reader_list_num_rows(skip, tmpdir): - num_rows = 128 + num_rows = 20 src = pd.DataFrame( { "a": list_gen(int_gen, 0, num_rows, 80, 50), diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index ca02ee55df0..8f2e4811e36 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. import textwrap @@ -13,7 +13,14 @@ from cudf.testing import _utils as utils from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes -repr_categories = utils.NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] +repr_categories = [ + "uint16", + "int64", + "float64", + "str", + "category", + "datetime64[ns]", +] @pytest.mark.parametrize("dtype", repr_categories) @@ -84,36 +91,22 @@ def test_full_series(nrows, dtype): pd.reset_option("display.max_rows") +@pytest.mark.parametrize("nrows", [5, 10, 15]) +@pytest.mark.parametrize("ncols", [5, 10, 15]) +@pytest.mark.parametrize("size", [20, 21]) @pytest.mark.parametrize("dtype", repr_categories) -@pytest.mark.parametrize("nrows", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1]) -@pytest.mark.parametrize("ncols", [0, 1, 2, 9, 20 / 2, 11, 20 - 1, 20, 20 + 1]) -def test_full_dataframe_20(dtype, nrows, ncols): - size = 20 - pdf = pd.DataFrame( - {idx: np.random.randint(0, 100, size) for idx in range(size)} - ).astype(dtype) - gdf = cudf.from_pandas(pdf) - - assert pdf.__repr__() == gdf.__repr__() - assert pdf._repr_html_() == gdf._repr_html_() - assert pdf._repr_latex_() == gdf._repr_latex_() - - -@pytest.mark.parametrize("dtype", repr_categories) -@pytest.mark.parametrize("nrows", [9, 21 / 2, 11, 21 - 1]) -@pytest.mark.parametrize("ncols", [9, 21 / 2, 11, 21 - 1]) -def test_full_dataframe_21(dtype, nrows, ncols): - size = 21 +def test_full_dataframe_20(dtype, size, nrows, ncols): pdf = pd.DataFrame( {idx: np.random.randint(0, 100, size) for idx in range(size)} ).astype(dtype) gdf = cudf.from_pandas(pdf) - pd.options.display.max_rows = int(nrows) - pd.options.display.max_columns = int(ncols) - assert pdf.__repr__() == gdf.__repr__() - pd.reset_option("display.max_rows") - pd.reset_option("display.max_columns") + with pd.option_context( + "display.max_rows", 
int(nrows), "display.max_columns", int(ncols) + ): + assert repr(pdf) == repr(gdf) + assert pdf._repr_html_() == gdf._repr_html_() + assert pdf._repr_latex_() == gdf._repr_latex_() @given( diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index b8f975f233e..2efa781c506 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import re @@ -17,9 +17,9 @@ ) -@pytest.mark.parametrize("num_id_vars", [0, 1, 2, 10]) -@pytest.mark.parametrize("num_value_vars", [0, 1, 2, 10]) -@pytest.mark.parametrize("num_rows", [1, 2, 1000]) +@pytest.mark.parametrize("num_id_vars", [0, 1, 2]) +@pytest.mark.parametrize("num_value_vars", [0, 1, 2]) +@pytest.mark.parametrize("num_rows", [1, 2, 100]) @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) @pytest.mark.parametrize("nulls", ["none", "some", "all"]) def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 653c79fe603..56218372c23 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -532,12 +532,7 @@ def _cat_convert_seq_to_cudf(others): @pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) @pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) @pytest.mark.parametrize( - "index", - [ - ["1", "2", "3", "4", "5"], - pd.Series(["1", "2", "3", "4", "5"]), - pd.Index(["1", "2", "3", "4", "5"]), - ], + "index", [["1", "2", "3", "4", "5"]], ) def test_string_cat(ps_gs, others, sep, na_rep, index): ps, gs = ps_gs @@ -829,7 +824,9 @@ def test_string_join(ps_gs, sep): @pytest.mark.parametrize("pat", [r"(a)", r"(f)", r"([a-z])", r"([A-Z])"]) @pytest.mark.parametrize("expand", [True, False]) -@pytest.mark.parametrize("flags,flags_raise", [(0, 0), (1, 1)]) +@pytest.mark.parametrize( + "flags,flags_raise", [(0, 0), (re.M | re.S, 0), (re.I, 1)] +) def test_string_extract(ps_gs, pat, expand, flags, flags_raise): ps, gs = ps_gs expectation = raise_builder([flags_raise], NotImplementedError) @@ -862,9 +859,7 @@ def test_string_contains(ps_gs, pat, regex, flags, flags_raise, na, na_raise): ps, gs = ps_gs expectation = does_not_raise() - if flags_raise: - expectation = pytest.raises(ValueError) - if na_raise: + if flags_raise or na_raise: expectation = pytest.raises(NotImplementedError) with expectation: diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index dbff626c363..167f171fa26 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -205,6 +205,14 @@ def test_dataframe_to_struct(): df["a"][0] = 5 assert_eq(got, expect) + # check that a non-string (but convertible to string) named column can be + # converted to struct + df = cudf.DataFrame([[1, 2], [3, 4]], columns=[(1, "b"), 0]) + expect = cudf.Series([{"(1, 'b')": 1, "0": 2}, {"(1, 'b')": 3, "0": 4}]) + with pytest.warns(UserWarning, match="will be casted"): + got = df.to_struct() + assert_eq(got, expect) + @pytest.mark.parametrize( "series, slce", diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index 56090c8eacf..faaea6eec08 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -1,3 +1,4 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
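Reviewer note: replacing manual `pd.options` set/reset pairs with `pd.option_context`, as in the repr tests above, makes restoration exception-safe. A minimal illustration:

```python
import pandas as pd

before = pd.get_option("display.max_rows")
with pd.option_context("display.max_rows", 5):
    # The option applies only inside the block, even if an assertion
    # in the block fails.
    assert pd.get_option("display.max_rows") == 5
# Restored on exit, including exceptional exits.
assert pd.get_option("display.max_rows") == before
```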
import math import operator @@ -14,7 +15,11 @@ unary_ops, ) from cudf.core.udf.utils import precompiled -from cudf.testing._utils import NUMERIC_TYPES, _decimal_series, assert_eq +from cudf.testing._utils import ( + _decimal_series, + assert_eq, + parametrize_numeric_dtypes_pairwise, +) def run_masked_udf_test(func, data, args=(), **kwargs): @@ -238,10 +243,9 @@ def func(row): run_masked_udf_test(func, gdf, check_dtype=False) -@pytest.mark.parametrize("dtype_a", list(NUMERIC_TYPES)) -@pytest.mark.parametrize("dtype_b", list(NUMERIC_TYPES)) +@parametrize_numeric_dtypes_pairwise @pytest.mark.parametrize("op", [operator.add, operator.and_, operator.eq]) -def test_apply_mixed_dtypes(dtype_a, dtype_b, op): +def test_apply_mixed_dtypes(left_dtype, right_dtype, op): """ Test that operations can be performed between columns of different dtypes and return a column with the correct @@ -251,7 +255,7 @@ def test_apply_mixed_dtypes(dtype_a, dtype_b, op): # First perform the op on two dummy data on host, if numpy can # safely type cast, we should expect it to work in udf too. try: - op(getattr(np, dtype_a)(0), getattr(np, dtype_b)(42)) + op(np.dtype(left_dtype).type(0), np.dtype(right_dtype).type(42)) except TypeError: pytest.skip("Operation is unsupported for corresponding dtype.") @@ -261,8 +265,8 @@ def func(row): return op(x, y) gdf = cudf.DataFrame({"a": [1.5, None, 3, None], "b": [4, 5, None, None]}) - gdf["a"] = gdf["a"].astype(dtype_a) - gdf["b"] = gdf["b"].astype(dtype_b) + gdf["a"] = gdf["a"].astype(left_dtype) + gdf["b"] = gdf["b"].astype(right_dtype) run_masked_udf_test(func, gdf, check_dtype=False) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 191e6591a74..4dadfede866 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -2,6 +2,8 @@ import decimal import functools +import os +import traceback from collections.abc import Sequence from typing import FrozenSet, Set, Union @@ -37,6 +39,60 @@ } +# The test root is set by pytest to support situations where tests are run from +# a source tree on a built version of cudf. +NO_EXTERNAL_ONLY_APIS = os.getenv("NO_EXTERNAL_ONLY_APIS") + +_cudf_root = os.path.dirname(cudf.__file__) +# If the environment variable for the test root is not set, we default to +# using the path relative to the cudf root directory. +_tests_root = os.getenv("_CUDF_TEST_ROOT") or os.path.join(_cudf_root, "tests") + + +def _external_only_api(func, alternative=""): + """Decorator to indicate that a function should not be used internally. + + cudf contains many APIs that exist for pandas compatibility but are + intrinsically inefficient. For some of these cudf has internal + equivalents that are much faster. Usage of the slow public APIs inside + our implementation can lead to unnecessary performance bottlenecks. + Applying this decorator to such functions and setting the environment + variable NO_EXTERNAL_ONLY_APIS will cause such functions to raise + exceptions if they are called from anywhere inside cudf, making it easy + to identify and excise such usage. + + The `alternative` should be a complete phrase or sentence since it will + be used verbatim in error messages. + """ + + # If the first arg is a string then an alternative function to use in + # place of this API was provided, so we pass that to a subsequent call. 
+ # It would be cleaner to implement this pattern by using a class + # decorator with a factory method, but there is no way to generically + # wrap docstrings on a class (we would need the docstring to be on the + # class itself, not instances, because that's what `help` looks at) and + # there is also no way to make mypy happy with that approach. + if isinstance(func, str): + return lambda actual_func: _external_only_api(actual_func, func) + + if not NO_EXTERNAL_ONLY_APIS: + return func + + @functools.wraps(func) + def wrapper(*args, **kwargs): + # Check the immediately preceding frame to see if it's in cudf. + frame, lineno = next(traceback.walk_stack(None)) + fn = frame.f_code.co_filename + if _cudf_root in fn and _tests_root not in fn: + raise RuntimeError( + f"External-only API called in {fn} at line {lineno}. " + f"{alternative}" + ) + return func(*args, **kwargs) + + return wrapper + + # TODO: We should evaluate whether calls to this could be more easily replaced # with column.full, which appears to be significantly faster in simple cases. def scalar_broadcast_to(scalar, size, dtype=None): @@ -69,15 +125,7 @@ def scalar_broadcast_to(scalar, size, dtype=None): scalar = to_cudf_compatible_scalar(scalar, dtype=dtype) dtype = scalar.dtype - if cudf.dtype(dtype).kind in ("O", "U"): - gather_map = column.full(size, 0, dtype="int32") - scalar_str_col = column.as_column([scalar], dtype="str") - return scalar_str_col[gather_map] - else: - out_col = column.column_empty(size, dtype=dtype) - if out_col.size != 0: - out_col.data_array_view[:] = scalar - return out_col + return cudf.core.column.full(size=size, fill_value=scalar, dtype=dtype) def initfunc(f): @@ -146,28 +194,6 @@ def set_allocator( IS_NEP18_ACTIVE = _is_nep18_active() -class cached_property: - """ - Like @property, but only evaluated upon first invocation. - To force re-evaluation of a cached_property, simply delete - it with `del`. - """ - - # TODO: Can be replaced with functools.cached_property when we drop support - # for Python 3.7. - - def __init__(self, func): - self.func = func - - def __get__(self, instance, cls): - if instance is None: - return self - else: - value = self.func(instance) - object.__setattr__(instance, self.func.__name__, value) - return value - - class GetAttrGetItemMixin: """This mixin changes `__getattr__` to attempt a `__getitem__` call. @@ -363,7 +389,10 @@ def get_appropriate_dispatched_func( cupy_compatible_args, index = _get_cupy_compatible_args_index(args) if cupy_compatible_args: cupy_output = cupy_func(*cupy_compatible_args, **kwargs) - return _cast_to_appropriate_cudf_type(cupy_output, index) + if isinstance(cupy_output, cp.ndarray): + return _cast_to_appropriate_cudf_type(cupy_output, index) + else: + return cupy_output return NotImplemented diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 89b5301ee83..1b1f3e29ab2 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
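Reviewer note: the decorator above gates on the caller's filename, taken from the first frame yielded by `traceback.walk_stack(None)`. A self-contained sketch of that technique with hypothetical paths; the real decorator additionally threads through the `alternative` message and the environment-variable opt-in:

```python
import functools
import traceback

_PACKAGE_ROOT = "/path/to/mylib"  # hypothetical package root


def external_only(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # The first frame walk_stack(None) yields here is wrapper's caller.
        frame, lineno = next(traceback.walk_stack(None))
        filename = frame.f_code.co_filename
        if filename.startswith(_PACKAGE_ROOT):
            raise RuntimeError(
                f"external-only API called from {filename}:{lineno}"
            )
        return func(*args, **kwargs)

    return wrapper
```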
from collections.abc import Iterator @@ -6,6 +6,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from nvtx import annotate from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( @@ -39,6 +40,7 @@ @meta_nonempty.register(cudf.BaseIndex) +@annotate("_nonempty_index", color="green", domain="dask_cudf_python") def _nonempty_index(idx): if isinstance(idx, cudf.core.index.RangeIndex): return cudf.core.index.RangeIndex(2, name=idx.name) @@ -73,6 +75,7 @@ def _nonempty_index(idx): raise TypeError(f"Don't know how to handle index of type {type(idx)}") +@annotate("_get_non_empty_data", color="green", domain="dask_cudf_python") def _get_non_empty_data(s): if isinstance(s._column, cudf.core.column.CategoricalColumn): categories = ( @@ -100,6 +103,7 @@ def _get_non_empty_data(s): @meta_nonempty.register(cudf.Series) +@annotate("_nonempty_series", color="green", domain="dask_cudf_python") def _nonempty_series(s, idx=None): if idx is None: idx = _nonempty_index(s.index) @@ -109,6 +113,7 @@ def _nonempty_series(s, idx=None): @meta_nonempty.register(cudf.DataFrame) +@annotate("meta_nonempty_cudf", color="green", domain="dask_cudf_python") def meta_nonempty_cudf(x): idx = meta_nonempty(x.index) columns_with_dtype = dict() @@ -124,15 +129,18 @@ def meta_nonempty_cudf(x): @make_meta_dispatch.register((cudf.Series, cudf.DataFrame)) +@annotate("make_meta_cudf", color="green", domain="dask_cudf_python") def make_meta_cudf(x, index=None): return x.head(0) @make_meta_dispatch.register(cudf.BaseIndex) +@annotate("make_meta_cudf_index", color="green", domain="dask_cudf_python") def make_meta_cudf_index(x, index=None): return x[:0] +@annotate("_empty_series", color="green", domain="dask_cudf_python") def _empty_series(name, dtype, index=None): if isinstance(dtype, str) and dtype == "category": return cudf.Series( @@ -142,6 +150,7 @@ def _empty_series(name, dtype, index=None): @make_meta_obj.register(object) +@annotate("make_meta_object_cudf", color="green", domain="dask_cudf_python") def make_meta_object_cudf(x, index=None): """Create an empty cudf object containing the desired metadata. 
@@ -212,6 +221,7 @@ def make_meta_object_cudf(x, index=None): @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.BaseIndex)) +@annotate("concat_cudf", color="green", domain="dask_cudf_python") def concat_cudf( dfs, axis=0, @@ -236,11 +246,13 @@ def concat_cudf( @categorical_dtype_dispatch.register( (cudf.DataFrame, cudf.Series, cudf.BaseIndex) ) +@annotate("categorical_dtype_cudf", color="green", domain="dask_cudf_python") def categorical_dtype_cudf(categories=None, ordered=None): return cudf.CategoricalDtype(categories=categories, ordered=ordered) @tolist_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("tolist_cudf", color="green", domain="dask_cudf_python") def tolist_cudf(obj): return obj.to_arrow().to_pylist() @@ -248,6 +260,9 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) ) +@annotate( + "is_categorical_dtype_cudf", color="green", domain="dask_cudf_python" +) def is_categorical_dtype_cudf(obj): return cudf.api.types.is_categorical_dtype(obj) @@ -261,6 +276,7 @@ def is_categorical_dtype_cudf(obj): ) @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) + @annotate("percentile_cudf", color="green", domain="dask_cudf_python") def percentile_cudf(a, q, interpolation="linear"): # Cudf dispatch to the equivalent of `np.percentile`: # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html @@ -305,6 +321,7 @@ def percentile_cudf(a, q, interpolation="linear"): @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("union_categoricals_cudf", color="green", domain="dask_cudf_python") def union_categoricals_cudf( to_union, sort_categories=False, ignore_order=False ): @@ -313,11 +330,13 @@ def union_categoricals_cudf( ) +@annotate("safe_hash", color="green", domain="dask_cudf_python") def safe_hash(frame): return cudf.Series(frame.hash_values(), index=frame.index) @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) +@annotate("hash_object_cudf", color="green", domain="dask_cudf_python") def hash_object_cudf(frame, index=True): if index: return safe_hash(frame.reset_index()) @@ -325,6 +344,7 @@ def hash_object_cudf(frame, index=True): @hash_object_dispatch.register(cudf.BaseIndex) +@annotate("hash_object_cudf_index", color="green", domain="dask_cudf_python") def hash_object_cudf_index(ind, index=None): if isinstance(ind, cudf.MultiIndex): @@ -335,6 +355,7 @@ def hash_object_cudf_index(ind, index=None): @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) +@annotate("group_split_cudf", color="green", domain="dask_cudf_python") def group_split_cudf(df, c, k, ignore_index=False): return dict( zip( @@ -349,10 +370,12 @@ def group_split_cudf(df, c, k, ignore_index=False): @sizeof_dispatch.register(cudf.DataFrame) +@annotate("sizeof_cudf_dataframe", color="green", domain="dask_cudf_python") def sizeof_cudf_dataframe(df): return int(df.memory_usage().sum()) @sizeof_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("sizeof_cudf_series_index", color="green", domain="dask_cudf_python") def sizeof_cudf_series_index(obj): return obj.memory_usage() diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 729db6c232d..d8802f33941 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
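Reviewer note: `nvtx.annotate` is applied throughout these files as a decorator; it also works as a context manager, which can be handy when only part of a function is worth ranging. A small sketch; the names are illustrative, and the ranges are inert unless a profiler such as Nsight Systems is attached:

```python
from nvtx import annotate


@annotate("load_data", color="green", domain="example")
def load_data():
    return list(range(1000))


def process(data):
    # Range just the hot loop instead of the whole function.
    with annotate("hot_loop", color="blue", domain="example"):
        return [x * 2 for x in data]
```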
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index 729db6c232d..d8802f33941 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.

 import math
 import warnings
@@ -6,6 +6,7 @@
 import numpy as np
 import pandas as pd
+from nvtx import annotate
 from tlz import partition_all

 import dask
@@ -57,6 +58,7 @@ def __dask_postcompute__(self):
     def __dask_postpersist__(self):
         return type(self), (self._name, self._meta, self.divisions)

+    @annotate("_FRAME_INIT", color="green", domain="dask_cudf_python")
     def __init__(self, dsk, name, meta, divisions):
         if not isinstance(dsk, HighLevelGraph):
             dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[])
@@ -82,6 +84,9 @@ def __repr__(self):
         s = "<dask_cudf.%s | %d tasks | %d npartitions>"
         return s % (type(self).__name__, len(self.dask), self.npartitions)

+    @annotate(
+        "_FRAME_to_dask_dataframe", color="green", domain="dask_cudf_python"
+    )
     def to_dask_dataframe(self, **kwargs):
         """Create a dask.dataframe object from a dask_cudf object"""
         nullable_pd_dtype = kwargs.get("nullable_pd_dtype", False)
@@ -99,6 +104,9 @@ def to_dask_dataframe(self, **kwargs):
 class DataFrame(_Frame, dd.core.DataFrame):
     _partition_type = cudf.DataFrame

+    @annotate(
+        "DATAFRAME_assign_column", color="green", domain="dask_cudf_python"
+    )
     def _assign_column(self, k, v):
         def assigner(df, k, v):
             out = df.copy()
@@ -108,6 +116,7 @@ def assigner(df, k, v):
         meta = assigner(self._meta, k, dask_make_meta(v))
         return self.map_partitions(assigner, k, v, meta=meta)

+    @annotate("DATAFRAME_apply_rows", color="green", domain="dask_cudf_python")
     def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None):
         import uuid
@@ -127,6 +136,7 @@ def do_apply_rows(df, func, incols, outcols, kwargs):
             do_apply_rows, func, incols, outcols, kwargs, meta=meta
         )

+    @annotate("DATAFRAME_merge", color="green", domain="dask_cudf_python")
     def merge(self, other, **kwargs):
         if kwargs.pop("shuffle", "tasks") != "tasks":
             raise ValueError(
@@ -138,6 +148,7 @@ def merge(self, other, **kwargs):
             on = list(on)
         return super().merge(other, on=on, shuffle="tasks", **kwargs)

+    @annotate("DATAFRAME_join", color="green", domain="dask_cudf_python")
     def join(self, other, **kwargs):
         if kwargs.pop("shuffle", "tasks") != "tasks":
             raise ValueError(
@@ -155,6 +166,7 @@ def join(self, other, **kwargs):
             on = list(on)
         return super().join(other, how=how, on=on, shuffle="tasks", **kwargs)

+    @annotate("DATAFRAME_set_index", color="green", domain="dask_cudf_python")
     def set_index(self, other, sorted=False, divisions=None, **kwargs):
         if kwargs.pop("shuffle", "tasks") != "tasks":
             raise ValueError(
@@ -226,6 +238,9 @@ def set_index(self, other, sorted=False, divisions=None, **kwargs):
             **kwargs,
         )

+    @annotate(
+        "DATAFRAME_sort_values", color="green", domain="dask_cudf_python"
+    )
     def sort_values(
         self,
         by,
@@ -261,12 +276,14 @@ def sort_values(
             return df.reset_index(drop=True)
         return df

+    @annotate("DATAFRAME_to_parquet", color="green", domain="dask_cudf_python")
     def to_parquet(self, path, *args, **kwargs):
         """Calls dask.dataframe.io.to_parquet with CudfEngine backend"""
         from dask_cudf.io import to_parquet

         return to_parquet(self, path, *args, **kwargs)

+    @annotate("DATAFRAME_to_orc", color="green", domain="dask_cudf_python")
     def to_orc(self, path, **kwargs):
         """Calls dask_cudf.io.to_orc"""
         from dask_cudf.io import to_orc
@@ -274,6 +291,7 @@ def to_orc(self, path, **kwargs):
         return to_orc(self, path, **kwargs)

     @derived_from(pd.DataFrame)
+    @annotate("DATAFRAME_var", color="green", domain="dask_cudf_python")
     def var(
         self,
         axis=None,
@@ -302,6 +320,9 @@ def var(
         else:
             return _parallel_var(self, meta, skipna, split_every, out)

+    @annotate(
+        "DATAFRAME_repartition", color="green", domain="dask_cudf_python"
domain="dask_cudf_python" + ) def repartition(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.repartition method. Uses DataFrame.shuffle if `columns=` is specified. @@ -324,6 +345,7 @@ def repartition(self, *args, **kwargs): ) return super().repartition(*args, **kwargs) + @annotate("DATAFRAME_shuffle", color="green", domain="dask_cudf_python") def shuffle(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.shuffle method""" shuffle_arg = kwargs.pop("shuffle", None) @@ -331,18 +353,21 @@ def shuffle(self, *args, **kwargs): raise ValueError("dask_cudf does not support disk-based shuffle.") return super().shuffle(*args, shuffle="tasks", **kwargs) + @annotate("DATAFRAME_groupby", color="green", domain="dask_cudf_python") def groupby(self, by=None, **kwargs): from .groupby import CudfDataFrameGroupBy return CudfDataFrameGroupBy(self, by=by, **kwargs) +@annotate("DATAFRAME_sum_of_squares", color="green", domain="dask_cudf_python") def sum_of_squares(x): x = x.astype("f8")._column outcol = libcudf.reduce.reduce("sum_of_squares", x) return cudf.Series(outcol) +@annotate("DATAFRAME_var_aggregate", color="green", domain="dask_cudf_python") def var_aggregate(x2, x, n, ddof): try: with warnings.catch_warnings(record=True): @@ -355,10 +380,12 @@ def var_aggregate(x2, x, n, ddof): return np.float64(np.nan) +@annotate("DATAFRAME_nlargest_agg", color="green", domain="dask_cudf_python") def nlargest_agg(x, **kwargs): return cudf.concat(x).nlargest(**kwargs) +@annotate("DATAFRAME_nsmallest_agg", color="green", domain="dask_cudf_python") def nsmallest_agg(x, **kwargs): return cudf.concat(x).nsmallest(**kwargs) @@ -366,6 +393,7 @@ def nsmallest_agg(x, **kwargs): class Series(_Frame, dd.core.Series): _partition_type = cudf.Series + @annotate("Series_count", color="green", domain="dask_cudf_python") def count(self, split_every=False): return reduction( [self], @@ -375,12 +403,14 @@ def count(self, split_every=False): meta="i8", ) + @annotate("Series_mean", color="green", domain="dask_cudf_python") def mean(self, split_every=False): sum = self.sum(split_every=split_every) n = self.count(split_every=split_every) return sum / n @derived_from(pd.DataFrame) + @annotate("Series_var", color="green", domain="dask_cudf_python") def var( self, axis=None, @@ -409,16 +439,19 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) + @annotate("Series_groupby", color="green", domain="dask_cudf_python") def groupby(self, *args, **kwargs): from .groupby import CudfSeriesGroupBy return CudfSeriesGroupBy(self, *args, **kwargs) @property + @annotate("Series_list", color="green", domain="dask_cudf_python") def list(self): return ListMethods(self) @property + @annotate("Series_struct", color="green", domain="dask_cudf_python") def struct(self): return StructMethods(self) @@ -427,6 +460,7 @@ class Index(Series, dd.core.Index): _partition_type = cudf.Index # type: ignore +@annotate("_naive_var", color="green", domain="dask_cudf_python") def _naive_var(ddf, meta, skipna, ddof, split_every, out): num = ddf._get_numeric_data() x = 1.0 * num.sum(skipna=skipna, split_every=split_every) @@ -441,6 +475,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): return handle_out(out, result) +@annotate("_parallel_var", color="green", domain="dask_cudf_python") def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): if skipna: @@ -507,6 +542,7 @@ def _finalize_var(vals): return handle_out(out, result) +@annotate("_extract_meta", color="green", domain="dask_cudf_python") def 
 def _extract_meta(x):
     """
     Extract internal cache data (``_meta``) from dask_cudf objects
@@ -522,6 +558,7 @@ def _extract_meta(x):
     return x

+@annotate("_emulate", color="green", domain="dask_cudf_python")
 def _emulate(func, *args, **kwargs):
     """
     Apply a function using args / kwargs. If arguments contain dd.DataFrame /
@@ -531,6 +568,7 @@ def _emulate(func, *args, **kwargs):
     return func(*_extract_meta(args), **_extract_meta(kwargs))

+@annotate("align_partitions", color="green", domain="dask_cudf_python")
 def align_partitions(args):
     """Align partitions between dask_cudf objects.
@@ -546,6 +584,7 @@ def align_partitions(args):
     return args

+@annotate("reduction", color="green", domain="dask_cudf_python")
 def reduction(
     args,
     chunk=None,
@@ -684,6 +723,7 @@ def reduction(
     return dd.core.new_dd_object(graph, b, meta, (None, None))

+@annotate("from_cudf", color="green", domain="dask_cudf_python")
 def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
     if isinstance(getattr(data, "index", None), cudf.MultiIndex):
         raise NotImplementedError(
@@ -705,6 +745,7 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
     )

+@annotate("from_dask_dataframe", color="green", domain="dask_cudf_python")
 def from_dask_dataframe(df):
     return df.map_partitions(cudf.from_pandas)
diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py
index 1bc270a5b9f..658e63ea923 100644
--- a/python/dask_cudf/dask_cudf/groupby.py
+++ b/python/dask_cudf/dask_cudf/groupby.py
@@ -6,6 +6,7 @@
 import numpy as np
 import pandas as pd
+from nvtx import annotate

 from dask.base import tokenize
 from dask.dataframe.core import (
@@ -35,11 +36,19 @@
 class CudfDataFrameGroupBy(DataFrameGroupBy):
+    @annotate(
+        "CudfDataFrameGroupBy_INIT", color="green", domain="dask_cudf_python"
+    )
     def __init__(self, *args, **kwargs):
         self.sep = kwargs.pop("sep", "___")
         self.as_index = kwargs.pop("as_index", True)
         super().__init__(*args, **kwargs)

+    @annotate(
+        "CudfDataFrameGroupBy_GETITEM",
+        color="green",
+        domain="dask_cudf_python",
+    )
     def __getitem__(self, key):
         if isinstance(key, list):
             g = CudfDataFrameGroupBy(
@@ -53,6 +62,9 @@ def __getitem__(self, key):
             g._meta = g._meta[key]
         return g

+    @annotate(
+        "CudfDataFrameGroupBy_MEAN", color="green", domain="dask_cudf_python"
+    )
     def mean(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -66,6 +78,11 @@ def mean(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )

+    @annotate(
+        "CudfDataFrameGroupBy_COLLECT",
+        color="green",
+        domain="dask_cudf_python",
+    )
     def collect(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -79,6 +96,11 @@ def collect(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )

+    @annotate(
+        "CudfDataFrameGroupBy_AGGREGATE",
+        color="green",
+        domain="dask_cudf_python",
+    )
     def aggregate(self, arg, split_every=None, split_out=1):
         if arg == "size":
             return self.size()
@@ -118,11 +140,17 @@ def aggregate(self, arg, split_every=None, split_out=1):
 class CudfSeriesGroupBy(SeriesGroupBy):
+    @annotate(
+        "CudfSeriesGroupBy_INIT", color="green", domain="dask_cudf_python"
+    )
     def __init__(self, *args, **kwargs):
         self.sep = kwargs.pop("sep", "___")
         self.as_index = kwargs.pop("as_index", True)
         super().__init__(*args, **kwargs)

+    @annotate(
+        "CudfSeriesGroupBy_MEAN", color="green", domain="dask_cudf_python"
+    )
     def mean(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -136,6 +164,9 @@ def mean(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )[self._slice]

+    @annotate(
+        "CudfSeriesGroupBy_STD", color="green", domain="dask_cudf_python"
+    )
     def std(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -149,6 +180,9 @@ def std(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )[self._slice]

+    @annotate(
+        "CudfSeriesGroupBy_VAR", color="green", domain="dask_cudf_python"
+    )
     def var(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -162,6 +196,9 @@ def var(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )[self._slice]

+    @annotate(
+        "CudfSeriesGroupBy_COLLECT", color="green", domain="dask_cudf_python"
+    )
     def collect(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -175,6 +212,9 @@ def collect(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )[self._slice]

+    @annotate(
+        "CudfSeriesGroupBy_AGGREGATE", color="green", domain="dask_cudf_python"
+    )
     def aggregate(self, arg, split_every=None, split_out=1):
         if arg == "size":
             return self.size()
@@ -205,6 +245,7 @@ def aggregate(self, arg, split_every=None, split_out=1):
     )

+@annotate("groupby_agg", color="green", domain="dask_cudf_python")
 def groupby_agg(
     ddf,
     gb_cols,
@@ -371,6 +412,7 @@ def groupby_agg(
     return new_dd_object(graph, gb_agg_name, _meta, divisions)

+@annotate("_redirect_aggs", color="green", domain="dask_cudf_python")
 def _redirect_aggs(arg):
     """Redirect aggregations to their corresponding name in cuDF"""
     redirects = {
@@ -397,6 +439,7 @@ def _redirect_aggs(arg):
     return redirects.get(arg, arg)

+@annotate("_is_supported", color="green", domain="dask_cudf_python")
 def _is_supported(arg, supported: set):
     """Check that aggregations in `arg` are a subset of `supported`"""
     if isinstance(arg, (list, dict)):
@@ -422,6 +465,7 @@ def _make_name(*args, sep="_"):
     return sep.join(_args)

+@annotate("_groupby_partition_agg", color="green", domain="dask_cudf_python")
 def _groupby_partition_agg(
     df, gb_cols, aggs, columns, split_out, dropna, sort, sep
 ):
@@ -479,6 +523,7 @@ def _groupby_partition_agg(
     return output

+@annotate("_tree_node_agg", color="green", domain="dask_cudf_python")
 def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep):
     """Node in groupby-aggregation reduction tree.
@@ -513,6 +558,7 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep):
     return gb

+@annotate("_var_agg", color="green", domain="dask_cudf_python")
 def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
     """Calculate variance (given count, sum, and sum-squared columns)."""
@@ -534,6 +580,7 @@ def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
     return var

+@annotate("_finalize_gb_agg", color="green", domain="dask_cudf_python")
 def _finalize_gb_agg(
     gb,
     gb_cols,
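The groupby helpers annotated above form a chunk/combine/finalize tree: `_groupby_partition_agg` computes per-partition partial aggregates, `_tree_node_agg` re-aggregates concatenated partials at each level of the reduction tree, and `_finalize_gb_agg` (with `_var_agg`) derives the requested statistics from count, sum, and sum-of-squares columns. A hedged sketch of that decomposition follows; the column names and helper functions are illustrative, not dask_cudf internals:

# Illustrative chunk/combine/finalize groupby, mirroring the
# count/sum/pow2-sum decomposition that _var_agg relies on. Not library code.
import cudf

def chunk(df):
    # Per-partition partial aggregates over column "x" grouped by "key".
    part = df.assign(x2=df["x"] ** 2)
    gb = part.groupby("key").agg({"x": ["count", "sum"], "x2": "sum"})
    gb.columns = ["count", "sum", "pow2_sum"]
    return gb.reset_index()

def combine(parts):
    # Partial aggregates are themselves summable, so any number of
    # partitions can be merged at each level of the tree.
    return cudf.concat(parts).groupby("key").sum().reset_index()

def finalize(df, ddof=1):
    # Mean and variance recovered from the three running totals.
    mean = df["sum"] / df["count"]
    var = (df["pow2_sum"] - df["count"] * mean**2) / (df["count"] - ddof)
    return df.assign(mean=mean, var=var)[["key", "mean", "var"]]

Because the partials are summable, the combine step can be applied repeatedly with any fan-in, which is what makes the `split_every` tree reduction in `groupby_agg` possible.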
diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py
index af40d9ca41b..ada738c5a9b 100755
--- a/python/dask_cudf/dask_cudf/sorting.py
+++ b/python/dask_cudf/dask_cudf/sorting.py
@@ -1,9 +1,11 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+
 from collections.abc import Iterator

 import cupy
 import numpy as np
 import tlz as toolz
+from nvtx import annotate

 from dask.base import tokenize
 from dask.dataframe import methods
@@ -16,12 +18,14 @@
 from cudf.api.types import is_categorical_dtype

+@annotate("set_index_post", color="green", domain="dask_cudf_python")
 def set_index_post(df, index_name, drop, column_dtype):
     df2 = df.set_index(index_name, drop=drop)
     df2.columns = df2.columns.astype(column_dtype)
     return df2

+@annotate("_set_partitions_pre", color="green", domain="dask_cudf_python")
 def _set_partitions_pre(s, divisions, ascending=True, na_position="last"):
     if ascending:
         partitions = divisions.searchsorted(s, side="right") - 1
@@ -38,6 +42,7 @@ def _set_partitions_pre(s, divisions, ascending=True, na_position="last"):
     return partitions

+@annotate("_quantile", color="green", domain="dask_cudf_python")
 def _quantile(a, q):
     n = len(a)
     if not len(a):
@@ -45,6 +50,7 @@ def _quantile(a, q):
     return (a.quantiles(q=q.tolist(), interpolation="nearest"), n)

+@annotate("merge_quantiles", color="green", domain="dask_cudf_python")
 def merge_quantiles(finalq, qs, vals):
     """Combine several quantile calculations of different data.
     [NOTE: Same logic as dask.array merge_percentiles]
@@ -107,6 +113,7 @@ def _append_counts(val, count):
     return rv.reset_index(drop=True)

+@annotate("_approximate_quantile", color="green", domain="dask_cudf_python")
 def _approximate_quantile(df, q):
     """Approximate quantiles of DataFrame or Series.
     [NOTE: Same logic as dask.dataframe Series quantile]
@@ -180,6 +187,7 @@ def set_quantile_index(df):
     return df

+@annotate("quantile_divisions", color="green", domain="dask_cudf_python")
 def quantile_divisions(df, by, npartitions):
     qn = np.linspace(0.0, 1.0, npartitions + 1).tolist()
     divisions = _approximate_quantile(df[by], qn).compute()
@@ -213,6 +221,7 @@ def quantile_divisions(df, by, npartitions):
     return divisions

+@annotate("sort_values", color="green", domain="dask_cudf_python")
 def sort_values(
     df,
     by,
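`sort_values` ties the annotated helpers above together: `quantile_divisions` picks partition boundaries from approximate quantiles of the sort key, and `_set_partitions_pre` maps every row to an output partition with `searchsorted` before the shuffle. A standalone sketch of that mapping, with made-up values:

# Standalone sketch of the searchsorted partition assignment performed by
# _set_partitions_pre (ascending case). The data here is made up.
import cudf

divisions = cudf.Series([0, 10, 20, 30])  # boundaries for 3 output partitions
s = cudf.Series([7, 25, 10, 29, 0])       # sort-key values from one partition

partitions = divisions.searchsorted(s, side="right") - 1
# Keys at or beyond the last boundary belong in the final partition.
partitions[(s >= divisions.iloc[-1]).values] = len(divisions) - 2

Each row is then shuffled to the partition index computed here, and a partition-local sort leaves the whole frame globally sorted.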