diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 84616c25cf2..15b51da8dea 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,7 +5,7 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json similarity index 92% rename from .devcontainer/cuda12.0-conda/devcontainer.json rename to .devcontainer/cuda12.2-conda/devcontainer.json index ef2b34b41a6..31ae8426763 100644 --- a/.devcontainer/cuda12.0-conda/devcontainer.json +++ b/.devcontainer/cuda12.2-conda/devcontainer.json @@ -3,7 +3,7 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.0", + "CUDA": "12.2", "PYTHON_PACKAGE_MANAGER": "conda", "BASE": "rapidsai/devcontainers:24.04-cpp-mambaforge-ubuntu22.04" } @@ -15,7 +15,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.0-envs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.2-envs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", @@ -24,7 +24,7 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.0-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.2-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json similarity index 87% rename from .devcontainer/cuda12.0-pip/devcontainer.json rename to .devcontainer/cuda12.2-pip/devcontainer.json index d3257b6cf43..93367527a86 100644 --- a/.devcontainer/cuda12.0-pip/devcontainer.json +++ b/.devcontainer/cuda12.2-pip/devcontainer.json @@ -3,9 +3,9 @@ "context": "${localWorkspaceFolder}/.devcontainer", "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile", "args": { - "CUDA": "12.0", + "CUDA": "12.2", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda12.0-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-cuda12.2-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, @@ -15,7 +15,7 @@ "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" ], - "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs}"], + "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs}"], "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"], "workspaceFolder": "/home/coder", "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent", @@ -23,7 +23,7 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.0-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 9578d32d13d..31cfeaf4ca3 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -11,6 +11,7 @@ python/dask_cudf/ @rapidsai/cudf-dask-codeowners cpp/CMakeLists.txt @rapidsai/cudf-cmake-codeowners cpp/libcudf_kafka/CMakeLists.txt @rapidsai/cudf-cmake-codeowners **/cmake/ @rapidsai/cudf-cmake-codeowners +*.cmake @rapidsai/cudf-cmake-codeowners #java code owners java/ @rapidsai/cudf-java-codeowners diff --git a/.github/labeler.yml b/.github/labeler.yml index b0b0db9684a..d14344384d1 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -10,6 +10,7 @@ libcudf: CMake: - '**/CMakeLists.txt' - '**/cmake/**' + - '**/*.cmake' cuDF (Java): - 'java/**' diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9e11993048f..4a662ed0f43 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -141,8 +141,10 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@fix/devcontainer-json-location with: + arch: '["amd64"]' + cuda: '["12.2"]' build_command: | sccache -z; build-all -DBUILD_BENCHMARKS=ON --verbose; diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d302543368e..9235c80bdc9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -126,12 +126,6 @@ repos: - cmakelang==0.6.13 verbose: true require_serial: true - - id: copyright-check - name: copyright-check - entry: python ./ci/checks/copyright.py --git-modified-only --update-current-year - language: python - pass_filenames: false - additional_dependencies: [gitpython] - id: doxygen-check name: doxygen-check entry: ./ci/checks/doxygen.sh @@ -161,6 +155,13 @@ repos: hooks: - id: ruff files: python/.*$ + - repo: https://github.com/rapidsai/pre-commit-hooks + rev: v0.0.1 + hooks: + - id: verify-copyright + exclude: | + (?x) + cpp/include/cudf_test/cxxopts[.]hpp$ default_language_version: diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py deleted file mode 100644 index dd89b092496..00000000000 --- a/ci/checks/copyright.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import datetime -import os -import re -import sys - -import git - -FilesToCheck = [ - re.compile(r"[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx)$"), - re.compile(r"CMakeLists[.]txt$"), - re.compile(r"CMakeLists_standalone[.]txt$"), - re.compile(r"setup[.]cfg$"), - re.compile(r"meta[.]yaml$"), -] -ExemptFiles = [ - re.compile(r"cpp/include/cudf_test/cxxopts.hpp"), -] - -# this will break starting at year 10000, which is probably OK :) -CheckSimple = re.compile( - r"Copyright *(?:\(c\))? *(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)" -) -CheckDouble = re.compile( - r"Copyright *(?:\(c\))? *(\d{4})-(\d{4}),? *NVIDIA C(?:ORPORATION|orporation)" # noqa: E501 -) - - -def checkThisFile(f): - if isinstance(f, git.Diff): - if f.deleted_file or f.b_blob.size == 0: - return False - f = f.b_path - elif not os.path.exists(f) or os.stat(f).st_size == 0: - # This check covers things like symlinks which point to files that DNE - return False - for exempt in ExemptFiles: - if exempt.search(f): - return False - for checker in FilesToCheck: - if checker.search(f): - return True - return False - - -def modifiedFiles(): - """Get a set of all modified files, as Diff objects. - - The files returned have been modified in git since the merge base of HEAD - and the upstream of the target branch. We return the Diff objects so that - we can read only the staged changes. - """ - repo = git.Repo() - # Use the environment variable TARGET_BRANCH or RAPIDS_BASE_BRANCH (defined in CI) if possible - target_branch = os.environ.get("TARGET_BRANCH", os.environ.get("RAPIDS_BASE_BRANCH")) - if target_branch is None: - # Fall back to the closest branch if not on CI - target_branch = repo.git.describe( - all=True, tags=True, match="branch-*", abbrev=0 - ).lstrip("heads/") - - upstream_target_branch = None - if target_branch in repo.heads: - # Use the tracking branch of the local reference if it exists. This - # returns None if no tracking branch is set. - upstream_target_branch = repo.heads[target_branch].tracking_branch() - if upstream_target_branch is None: - # Fall back to the remote with the newest target_branch. This code - # path is used on CI because the only local branch reference is - # current-pr-branch, and thus target_branch is not in repo.heads. - # This also happens if no tracking branch is defined for the local - # target_branch. We use the remote with the latest commit if - # multiple remotes are defined. - candidate_branches = [ - remote.refs[target_branch] for remote in repo.remotes - if target_branch in remote.refs - ] - if len(candidate_branches) > 0: - upstream_target_branch = sorted( - candidate_branches, - key=lambda branch: branch.commit.committed_datetime, - )[-1] - else: - # If no remotes are defined, try to use the local version of the - # target_branch. If this fails, the repo configuration must be very - # strange and we can fix this script on a case-by-case basis. - upstream_target_branch = repo.heads[target_branch] - merge_base = repo.merge_base("HEAD", upstream_target_branch.commit)[0] - diff = merge_base.diff() - changed_files = {f for f in diff if f.b_path is not None} - return changed_files - - -def getCopyrightYears(line): - res = CheckSimple.search(line) - if res: - return int(res.group(1)), int(res.group(1)) - res = CheckDouble.search(line) - if res: - return int(res.group(1)), int(res.group(2)) - return None, None - - -def replaceCurrentYear(line, start, end): - # first turn a simple regex into double (if applicable). then update years - res = CheckSimple.sub(r"Copyright (c) \1-\1, NVIDIA CORPORATION", line) - res = CheckDouble.sub( - rf"Copyright (c) {start:04d}-{end:04d}, NVIDIA CORPORATION", - res, - ) - return res - - -def checkCopyright(f, update_current_year): - """Checks for copyright headers and their years.""" - errs = [] - thisYear = datetime.datetime.now().year - lineNum = 0 - crFound = False - yearMatched = False - - if isinstance(f, git.Diff): - path = f.b_path - lines = f.b_blob.data_stream.read().decode().splitlines(keepends=True) - else: - path = f - with open(f, encoding="utf-8") as fp: - lines = fp.readlines() - - for line in lines: - lineNum += 1 - start, end = getCopyrightYears(line) - if start is None: - continue - crFound = True - if start > end: - e = [ - path, - lineNum, - "First year after second year in the copyright " - "header (manual fix required)", - None, - ] - errs.append(e) - elif thisYear < start or thisYear > end: - e = [ - path, - lineNum, - "Current year not included in the copyright header", - None, - ] - if thisYear < start: - e[-1] = replaceCurrentYear(line, thisYear, end) - if thisYear > end: - e[-1] = replaceCurrentYear(line, start, thisYear) - errs.append(e) - else: - yearMatched = True - # copyright header itself not found - if not crFound: - e = [ - path, - 0, - "Copyright header missing or formatted incorrectly " - "(manual fix required)", - None, - ] - errs.append(e) - # even if the year matches a copyright header, make the check pass - if yearMatched: - errs = [] - - if update_current_year: - errs_update = [x for x in errs if x[-1] is not None] - if len(errs_update) > 0: - lines_changed = ", ".join(str(x[1]) for x in errs_update) - print(f"File: {path}. Changing line(s) {lines_changed}") - for _, lineNum, __, replacement in errs_update: - lines[lineNum - 1] = replacement - with open(path, "w", encoding="utf-8") as out_file: - out_file.writelines(lines) - - return errs - - -def getAllFilesUnderDir(root, pathFilter=None): - retList = [] - for dirpath, dirnames, filenames in os.walk(root): - for fn in filenames: - filePath = os.path.join(dirpath, fn) - if pathFilter(filePath): - retList.append(filePath) - return retList - - -def checkCopyright_main(): - """ - Checks for copyright headers in all the modified files. In case of local - repo, this script will just look for uncommitted files and in case of CI - it compares between branches "$PR_TARGET_BRANCH" and "current-pr-branch" - """ - retVal = 0 - - argparser = argparse.ArgumentParser( - "Checks for a consistent copyright header in git's modified files" - ) - argparser.add_argument( - "--update-current-year", - dest="update_current_year", - action="store_true", - required=False, - help="If set, " - "update the current year if a header is already " - "present and well formatted.", - ) - argparser.add_argument( - "--git-modified-only", - dest="git_modified_only", - action="store_true", - required=False, - help="If set, " - "only files seen as modified by git will be " - "processed.", - ) - - args, dirs = argparser.parse_known_args() - - if args.git_modified_only: - files = [f for f in modifiedFiles() if checkThisFile(f)] - else: - files = [] - for d in [os.path.abspath(d) for d in dirs]: - if not os.path.isdir(d): - raise ValueError(f"{d} is not a directory.") - files += getAllFilesUnderDir(d, pathFilter=checkThisFile) - - errors = [] - for f in files: - errors += checkCopyright(f, args.update_current_year) - - if len(errors) > 0: - if any(e[-1] is None for e in errors): - print("Copyright headers incomplete in some of the files!") - for e in errors: - print(" %s:%d Issue: %s" % (e[0], e[1], e[2])) - print("") - n_fixable = sum(1 for e in errors if e[-1] is not None) - path_parts = os.path.abspath(__file__).split(os.sep) - file_from_repo = os.sep.join(path_parts[path_parts.index("ci") :]) - if n_fixable > 0 and not args.update_current_year: - print( - f"You can run `python {file_from_repo} --git-modified-only " - "--update-current-year` and stage the results in git to " - f"fix {n_fixable} of these errors.\n" - ) - retVal = 1 - - return retVal - - -if __name__ == "__main__": - sys.exit(checkCopyright_main()) diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index ace71bb0b75..bacb54b3896 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -18,7 +18,7 @@ rapids-logger "pytest cudf" ./ci/run_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=../.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-coverage.xml" \ @@ -32,7 +32,7 @@ rapids-logger "pytest cudf" rapids-logger "pytest for cudf benchmarks" ./ci/run_cudf_pytest_benchmarks.sh \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-benchmark-coverage.xml" \ @@ -41,7 +41,7 @@ rapids-logger "pytest for cudf benchmarks" rapids-logger "pytest for cudf benchmarks using pandas" ./ci/run_cudf_pandas_pytest_benchmarks.sh \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=.coveragerc \ --cov=cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-benchmark-pandas-coverage.xml" \ diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index bc15747b26a..9cdceb295db 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -23,7 +23,7 @@ rapids-logger "pytest dask_cudf" ./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=../.coveragerc \ --cov=dask_cudf \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \ @@ -33,7 +33,7 @@ rapids-logger "pytest custreamz" ./ci/run_custreamz_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-custreamz.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ --cov-config=../.coveragerc \ --cov=custreamz \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/custreamz-coverage.xml" \ diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index b7e8f862ed5..af5779f478a 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -37,7 +37,7 @@ else --cache-clear \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ . popd fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5fd6cd3544a..c74963be50d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -376,7 +376,7 @@ add_library( src/io/functions.cpp src/io/json/byte_range_info.cu src/io/json/json_column.cu - src/io/json/json_quote_normalization.cu + src/io/json/json_normalization.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu src/io/json/read_json.cu diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index 639a3dc1181..a7db972d39f 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -45,7 +45,9 @@ void bench_case(nvbench::state& state) cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126); // nice ASCII range auto input = cudf::strings_column_view(col_view); auto ascii_column = create_random_column( - cudf::type_id::INT8, row_count{input.chars_size(cudf::get_default_stream())}, ascii_profile); + cudf::type_id::INT8, + row_count{static_cast(input.chars_size(cudf::get_default_stream()))}, + ascii_profile); auto ascii_data = ascii_column->view(); col_view = cudf::column_view(col_view.type(), diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 0eb0e17ea10..3f7f7e9bb32 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -63,4 +63,14 @@ rmm::device_uvector normalize_single_quotes(rmm::device_uvector&& in rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Normalize unquoted whitespace (space and tab characters) using FST + * + * @param inbuf Input device buffer + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ +rmm::device_uvector normalize_whitespace(rmm::device_uvector&& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace cudf::io::json::detail diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index f0c3d48ab7e..593dd044d51 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -118,6 +118,9 @@ class json_reader_options { // Normalize single quotes bool _normalize_single_quotes = false; + // Normalize unquoted spaces and tabs + bool _normalize_whitespace = false; + // Whether to recover after an invalid JSON line json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; @@ -265,6 +268,13 @@ class json_reader_options { */ bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; } + /** + * @brief Whether the reader should normalize unquoted whitespace characters + * + * @returns true if the reader should normalize whitespace, false otherwise + */ + bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; } + /** * @brief Queries the JSON reader's behavior on invalid JSON lines. * @@ -358,6 +368,14 @@ class json_reader_options { */ void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; } + /** + * @brief Set whether the reader should enable normalization of unquoted whitespace + * + * @param val Boolean value to indicate whether the reader should normalize unquoted whitespace + * characters i.e. tabs and spaces + */ + void enable_normalize_whitespace(bool val) { _normalize_whitespace = val; } + /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. * @@ -533,6 +551,19 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether the reader should normalize unquoted whitespace + * + * @param val Boolean value to indicate whether the reader should normalize unquoted + * whitespace + * @return this for chaining + */ + json_reader_options_builder& normalize_whitespace(bool val) + { + options._normalize_whitespace = val; + return *this; + } + /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. * diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index 840a2dd1165..036589e17fe 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -112,7 +112,7 @@ class strings_column_view : private column_view { * @param stream CUDA stream used for device memory operations and kernel launches * @return Number of bytes in the chars child column */ - [[nodiscard]] size_type chars_size(rmm::cuda_stream_view stream) const noexcept; + [[nodiscard]] int64_t chars_size(rmm::cuda_stream_view stream) const noexcept; /** * @brief Return an iterator for the chars child column. diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_normalization.cu similarity index 57% rename from cpp/src/io/json/json_quote_normalization.cu rename to cpp/src/io/json/json_normalization.cu index a13b6e0b016..86e4da664a8 100644 --- a/cpp/src/io/json/json_quote_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -32,13 +32,15 @@ namespace cudf::io::json { -using SymbolT = char; -using StateT = char; +// Type used to represent the atomic symbol type used within the finite-state machine +using SymbolT = char; +using StateT = char; + +// Type sufficiently large to index symbols within the input and output (may be unsigned) using SymbolOffsetT = uint32_t; namespace normalize_quotes { -// Type sufficiently large to index symbols within the input and output (may be unsigned) enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES }; enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " @@ -172,6 +174,116 @@ struct TransduceToNormalizedQuotes { } // namespace normalize_quotes +namespace normalize_whitespace { + +enum class dfa_symbol_group_id : uint32_t { + DOUBLE_QUOTE_CHAR, ///< Quote character SG: " + ESCAPE_CHAR, ///< Escape character SG: '\\' + NEWLINE_CHAR, ///< Newline character SG: '\n' + WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' ' + OTHER_SYMBOLS, ///< SG implicitly matching all other characters + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; +// Alias for readability of symbol group ids +constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); +// The i-th string representing all the characters of a symbol group +std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ + {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}}; + +/** + * -------- FST states --------- + * ----------------------------- + * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double + * | quotes as well as any other character not enclosed by a string. Also handles + * | newline character present within a string + * TT_DQS | Double-quoted string state handling all characters within double quotes except + * | newline character + * TT_DEC | State handling escaped characters inside double-quoted string. Note that this + * | state is necessary to process escaped double-quote characters. Without this + * | state, whitespaces following escaped double quotes inside strings may be removed. + * + * NOTE: An important case NOT handled by this FST is that of whitespace following newline + * characters within a string. Consider the following example + * Input: {"a":"x\n y"} + * FST output: {"a":"x\ny"} + * Expected output: {"a":"x\n y"} + * Such strings are not part of the JSON standard (characters allowed within quotes should + * have ASCII at least 0x20 i.e. space character and above) but may be encountered while + * reading JSON files + */ +enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES }; +// Aliases for readability of the transition table +constexpr auto TT_OOS = dfa_states::TT_OOS; +constexpr auto TT_DQS = dfa_states::TT_DQS; +constexpr auto TT_DEC = dfa_states::TT_DEC; +constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); + +// Transition table +std::array, TT_NUM_STATES> const wna_state_tt{ + {/* IN_STATE " \ \n OTHER */ + /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}}, + /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; + +// The DFA's starting state +constexpr StateT start_state = static_cast(TT_OOS); + +struct TransduceToNormalizedWS { + /** + * @brief Returns the -th output symbol on the transition (state_id, match_id). + */ + template + constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, + SymbolGroupT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + // -------- TRANSLATION TABLE ------------ + // Let the alphabet set be Sigma + // --------------------------------------- + // ---------- NON-SPECIAL CASES: ---------- + // Output symbol same as input symbol + // state | read_symbol -> output_symbol + // DQS | Sigma -> Sigma + // OOS | Sigma\{,\t} -> Sigma\{,\t} + // DEC | Sigma -> Sigma + // ---------- SPECIAL CASES: -------------- + // Input symbol translates to output symbol + // OOS | {} -> + // OOS | {\t} -> + + // Case when read symbol is a space or tab but is unquoted + // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function + // However, since there is no output in this case i.e. the count returned by + // operator()(state_id, match_id, read_symbol) is zero, this function is never called. + // So skipping the check for this case. + + // In all other cases, we have an output symbol for the input symbol. + // We simply output the input symbol + return read_symbol; + } + + /** + * @brief Returns the number of output characters for a given transition. + * During whitespace normalization, we always emit one output character i.e., the input + * character, except when we need to remove the space/tab character + */ + template + constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id, + SymbolGroupT const match_id, + SymbolT const read_symbol) const + { + // Case when read symbol is a space or tab but is unquoted + if (match_id == static_cast(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && + state_id == static_cast(dfa_states::TT_OOS)) { + return 0; + } + return 1; + } +}; + +} // namespace normalize_whitespace + namespace detail { rmm::device_uvector normalize_single_quotes(rmm::device_uvector&& inbuf, @@ -198,5 +310,29 @@ rmm::device_uvector normalize_single_quotes(rmm::device_uvector normalize_whitespace(rmm::device_uvector&& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto parser = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), + fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), + fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}), + stream); + + rmm::device_uvector outbuf(inbuf.size(), stream, mr); + rmm::device_scalar outbuf_size(stream, mr); + parser.Transduce(inbuf.data(), + static_cast(inbuf.size()), + outbuf.data(), + thrust::make_discard_iterator(), + outbuf_size.data(), + normalize_whitespace::start_state, + stream); + + outbuf.resize(outbuf_size.value(stream), stream); + return outbuf; +} + } // namespace detail } // namespace cudf::io::json diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index ba8acf2d47a..506d7b6cddc 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -235,6 +235,13 @@ table_with_metadata read_json(host_span> sources, normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource()); } + // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is + // enabled, invoke pre-processing FST + if (reader_opts.is_enabled_normalize_whitespace()) { + buffer = + normalize_whitespace(std::move(buffer), stream, rmm::mr::get_current_device_resource()); + } + return device_parse_nested_json(buffer, reader_opts, stream, mr); // For debug purposes, use host_parse_nested_json() } diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index cf2ba369023..d2026473b6c 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -18,7 +18,6 @@ #include "io/utilities/config_utils.hpp" #include -#include #include #include #include @@ -27,7 +26,6 @@ #include -#include #include #include #include @@ -338,6 +336,33 @@ class device_buffer_source final : public datasource { cudf::device_span _d_buffer; ///< A non-owning view of the existing device data }; +// zero-copy host buffer source +class host_buffer_source final : public datasource { + public: + explicit host_buffer_source(cudf::host_span h_buffer) : _h_buffer{h_buffer} {} + + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + auto const count = std::min(size, this->size() - offset); + std::memcpy(dst, _h_buffer.data() + offset, count); + return count; + } + + std::unique_ptr host_read(size_t offset, size_t size) override + { + auto const count = std::min(size, this->size() - offset); + return std::make_unique( + reinterpret_cast(_h_buffer.data() + offset), count); + } + + [[nodiscard]] bool supports_device_read() const override { return false; } + + [[nodiscard]] size_t size() const override { return _h_buffer.size(); } + + private: + cudf::host_span _h_buffer; ///< A non-owning view of the existing host data +}; + /** * @brief Wrapper class for user implemented data sources * @@ -424,9 +449,7 @@ std::unique_ptr datasource::create(host_buffer const& buffer) std::unique_ptr datasource::create(cudf::host_span buffer) { - // Use Arrow IO buffer class for zero-copy reads of host memory - return std::make_unique(std::make_shared( - reinterpret_cast(buffer.data()), buffer.size())); + return std::make_unique(buffer); } std::unique_ptr datasource::create(cudf::device_span buffer) diff --git a/cpp/src/strings/strings_column_view.cpp b/cpp/src/strings/strings_column_view.cpp index 6be22d8e729..83ae916afc3 100644 --- a/cpp/src/strings/strings_column_view.cpp +++ b/cpp/src/strings/strings_column_view.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include @@ -45,10 +45,10 @@ strings_column_view::offset_iterator strings_column_view::offsets_end() const return offsets_begin() + size() + 1; } -size_type strings_column_view::chars_size(rmm::cuda_stream_view stream) const noexcept +int64_t strings_column_view::chars_size(rmm::cuda_stream_view stream) const noexcept { - if (size() == 0) return 0; - return detail::get_value(offsets(), offsets().size() - 1, stream); + if (size() == 0) { return 0L; } + return cudf::strings::detail::get_offset_value(offsets(), offsets().size() - 1, stream); } strings_column_view::chars_iterator strings_column_view::chars_begin(rmm::cuda_stream_view) const diff --git a/cpp/tests/io/json_whitespace_normalization_test.cu b/cpp/tests/io/json_whitespace_normalization_test.cu index 545d8d2c4f9..336d360063f 100644 --- a/cpp/tests/io/json_whitespace_normalization_test.cu +++ b/cpp/tests/io/json_whitespace_normalization_test.cu @@ -13,177 +13,41 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "io/fst/lookup_tables.cuh" -#include "io/utilities/hostdevice_vector.hpp" - #include #include -#include +#include #include -#include +#include +#include +#include #include +#include -#include #include +#include -#include - -#include #include -namespace { -// Type used to represent the atomic symbol type used within the finite-state machine -using SymbolT = char; -using StateT = char; - -// Type sufficiently large to index symbols within the input and output (may be unsigned) -using SymbolOffsetT = uint32_t; - -enum class dfa_symbol_group_id : uint32_t { - DOUBLE_QUOTE_CHAR, ///< Quote character SG: " - ESCAPE_CHAR, ///< Escape character SG: '\\' - NEWLINE_CHAR, ///< Newline character SG: '\n' - WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' ' - OTHER_SYMBOLS, ///< SG implicitly matching all other characters - NUM_SYMBOL_GROUPS ///< Total number of symbol groups -}; -// Alias for readability of symbol group ids -constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ - {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}}; - -/** - * -------- FST states --------- - * ----------------------------- - * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double - * | quotes as well as any other character not enclosed by a string. Also handles - * | newline character present within a string - * TT_DQS | Double-quoted string state handling all characters within double quotes except - * | newline character - * TT_DEC | State handling escaped characters inside double-quoted string. Note that this - * | state is necessary to process escaped double-quote characters. Without this - * | state, whitespaces following escaped double quotes inside strings may be removed. - * - * NOTE: An important case NOT handled by this FST is that of whitespace following newline - * characters within a string. Consider the following example - * Input: {"a":"x\n y"} - * FST output: {"a":"x\ny"} - * Expected output: {"a":"x\n y"} - */ -enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES }; -// Aliases for readability of the transition table -constexpr auto TT_OOS = dfa_states::TT_OOS; -constexpr auto TT_DQS = dfa_states::TT_DQS; -constexpr auto TT_DEC = dfa_states::TT_DEC; -constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); - -// Transition table -std::array, TT_NUM_STATES> const wna_state_tt{ - {/* IN_STATE " \ \n OTHER */ - /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}}, - /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}}, - /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; - -// The DFA's starting state -constexpr StateT start_state = static_cast(TT_OOS); - -struct TransduceToNormalizedWS { - /** - * @brief Returns the -th output symbol on the transition (state_id, match_id). - */ - template - constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, - SymbolGroupT const match_id, - RelativeOffsetT const relative_offset, - SymbolT const read_symbol) const - { - // -------- TRANSLATION TABLE ------------ - // Let the alphabet set be Sigma - // --------------------------------------- - // ---------- NON-SPECIAL CASES: ---------- - // Output symbol same as input symbol - // state | read_symbol -> output_symbol - // DQS | Sigma -> Sigma - // OOS | Sigma\{,\t} -> Sigma\{,\t} - // DEC | Sigma -> Sigma - // ---------- SPECIAL CASES: -------------- - // Input symbol translates to output symbol - // OOS | {} -> - // OOS | {\t} -> - - // Case when read symbol is a space or tab but is unquoted - // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function - // However, since there is no output in this case i.e. the count returned by - // operator()(state_id, match_id, read_symbol) is zero, this function is never called. - // So skipping the check for this case. - - // In all other cases, we have an output symbol for the input symbol. - // We simply output the input symbol - return read_symbol; - } - - /** - * @brief Returns the number of output characters for a given transition. - * During whitespace normalization, we always emit one output character i.e., the input - * character, except when we need to remove the space/tab character - */ - template - constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id, - SymbolGroupT const match_id, - SymbolT const read_symbol) const - { - // Case when read symbol is a space or tab but is unquoted - if (match_id == static_cast(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && - state_id == static_cast(dfa_states::TT_OOS)) { - return 0; - } - return 1; - } -}; -} // namespace - // Base test fixture for tests struct JsonWSNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& input, std::string const& output) +void run_test(std::string const& host_input, std::string const& expected_host_output) { - auto parser = cudf::io::fst::detail::make_fst( - cudf::io::fst::detail::make_symbol_group_lut(wna_sgs), - cudf::io::fst::detail::make_transition_table(wna_state_tt), - cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedWS{}), - cudf::test::get_default_stream()); - - auto d_input_scalar = cudf::make_string_scalar(input, cudf::test::get_default_stream()); - auto& d_input = static_cast&>(*d_input_scalar); + auto stream_view = cudf::get_default_stream(); + auto device_input = cudf::detail::make_device_uvector_async( + host_input, stream_view, rmm::mr::get_current_device_resource()); - // Prepare input & output buffers - constexpr std::size_t single_item = 1; - cudf::detail::hostdevice_vector output_gpu(input.size(), - cudf::test::get_default_stream()); - cudf::detail::hostdevice_vector output_gpu_size(single_item, - cudf::test::get_default_stream()); + // Preprocessing FST + auto device_fst_output = cudf::io::json::detail::normalize_whitespace( + std::move(device_input), stream_view, rmm::mr::get_current_device_resource()); - // Allocate device-side temporary storage & run algorithm - parser.Transduce(d_input.data(), - static_cast(d_input.size()), - output_gpu.device_ptr(), - thrust::make_discard_iterator(), - output_gpu_size.device_ptr(), - start_state, - cudf::test::get_default_stream()); + auto const preprocessed_host_output = + cudf::detail::make_std_vector_sync(device_fst_output, stream_view); - // Async copy results from device to host - output_gpu.device_to_host_async(cudf::test::get_default_stream()); - output_gpu_size.device_to_host_async(cudf::test::get_default_stream()); - - // Make sure results have been copied back to host - cudf::test::get_default_stream().synchronize(); - - // Verify results - ASSERT_EQ(output_gpu_size[0], output.size()); - CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output, output.size()); + ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size()); + CUDF_TEST_EXPECT_VECTOR_EQUAL( + preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); } TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces) @@ -259,4 +123,33 @@ TEST_F(JsonWSNormalizationTest, GroundTruth_InvalidInput) run_test(input, output); } +TEST_F(JsonWSNormalizationTest, ReadJsonOption) +{ + // When mixed type fields are read as strings, the table read will differ depending the + // value of normalize_whitespace + + // Test input + std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(true); + + cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); + + // Expected table + std::string const expected_input = R"({ "a" : {"b":"c"}})"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(false); + + cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 73f63ae1343..2e5b3916c65 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf. reduce rolling scalar + search stream_compaction sorting replace diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst new file mode 100644 index 00000000000..aa57bcd9d92 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst @@ -0,0 +1,6 @@ +====== +search +====== + +.. automodule:: cudf._lib.pylibcudf.search + :members: diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 62496e32f7a..b37d0d88ec9 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -31,6 +31,7 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean lines; private final boolean recoverWithNull; private final boolean normalizeSingleQuotes; + private final boolean normalizeWhitespace; private final boolean mixedTypesAsStrings; private final boolean keepStringQuotes; @@ -40,6 +41,7 @@ private JSONOptions(Builder builder) { lines = builder.lines; recoverWithNull = builder.recoverWithNull; normalizeSingleQuotes = builder.normalizeSingleQuotes; + normalizeWhitespace = builder.normalizeWhitespace; mixedTypesAsStrings = builder.mixedTypesAsStrings; keepStringQuotes = builder.keepQuotes; } @@ -61,6 +63,10 @@ public boolean isNormalizeSingleQuotes() { return normalizeSingleQuotes; } + public boolean isNormalizeWhitespace() { + return normalizeWhitespace; + } + public boolean isMixedTypesAsStrings() { return mixedTypesAsStrings; } @@ -84,6 +90,7 @@ public static final class Builder extends ColumnFilterOptions.Builder(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) - .keep_quotes(keep_quotes) - .mixed_types_as_string(mixed_types_as_string); + .normalize_whitespace(static_cast(normalize_whitespace)) + .mixed_types_as_string(mixed_types_as_string) + .keep_quotes(keep_quotes); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1461,8 +1462,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string, - jboolean keep_quotes) { + jboolean recover_with_null, jboolean normalize_single_quotes, jboolean normalize_whitespace, + jboolean mixed_types_as_string, jboolean keep_quotes) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1484,8 +1485,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) - .keep_quotes(keep_quotes) - .mixed_types_as_string(mixed_types_as_string); + .normalize_whitespace(static_cast(normalize_whitespace)) + .mixed_types_as_string(mixed_types_as_string) + .keep_quotes(keep_quotes); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1573,8 +1575,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types, jintArray j_scales, jboolean day_first, jboolean lines, jboolean recover_with_null, - jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes, - jlong ds_handle) { + jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, + jboolean keep_quotes, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1606,6 +1608,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) + .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); @@ -1646,7 +1649,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, jboolean recover_with_null, - jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes) { + jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, + jboolean keep_quotes) { bool read_buffer = true; if (buffer == 0) { @@ -1693,6 +1697,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) + .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 6f0b2b51f4c..44dd20561bf 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -88,6 +88,7 @@ public class TableTest extends CudfTestBase { private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json"); private static final File TEST_JSON_SINGLE_QUOTES_FILE = TestUtils.getResourceAsFile("single_quotes.json"); + private static final File TEST_JSON_WHITESPACES_FILE = TestUtils.getResourceAsFile("whitespaces.json"); private static final File TEST_MIXED_TYPE_1_JSON = TestUtils.getResourceAsFile("mixed_types_1.json"); private static final File TEST_MIXED_TYPE_2_JSON = TestUtils.getResourceAsFile("mixed_types_2.json"); @@ -349,6 +350,39 @@ void testReadSingleQuotesJSONFile() throws IOException { } @Test + void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "A") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withNormalizeSingleQuotes(false) + .build(); + try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) { + assertThrows(CudfException.class, () -> + Table.readJSON(schema, opts, source)); + } + } + + @Test + void testReadWhitespacesJSONFile() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("b", "50", "[1,2,3,4,5,6,7,8]", "{\"c\":\"d\"}", "b") + .build(); + MultiBufferDataSource source = sourceFrom(TEST_JSON_WHITESPACES_FILE); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { Schema schema = Schema.builder() .column(DType.STRING, "A") @@ -547,21 +581,6 @@ void testReadMixedType2JSONFile() throws IOException { } } - @Test - void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException { - Schema schema = Schema.builder() - .column(DType.STRING, "A") - .build(); - JSONOptions opts = JSONOptions.builder() - .withLines(true) - .withNormalizeSingleQuotes(false) - .build(); - try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) { - assertThrows(CudfException.class, () -> - Table.readJSON(schema, opts, source)); - } - } - @Test void testReadJSONFromDataSource() throws IOException { Schema schema = Schema.builder() @@ -3745,6 +3764,10 @@ void testChunkedPackTwoPasses() { // this test packes ~2MB worth of long into a 1MB bounce buffer // this is 3 iterations because of the validity buffer Long[] longs = new Long[256*1024]; + // Initialize elements at odd-numbered indices + for (int i = 1; i < longs.length; i += 2) { + longs[i] = (long)i; + } try (Table t1 = new Table.TestBuilder().column(longs).build(); DeviceMemoryBuffer bounceBuffer = DeviceMemoryBuffer.allocate(1L*1024*1024); ChunkedPack cp = t1.makeChunkedPack(1L*1024*1024); @@ -3757,7 +3780,7 @@ void testChunkedPackTwoPasses() { while (cp.hasNext()) { long copied = cp.next(bounceBuffer); target.copyFromDeviceBufferAsync( - offset, target, 0, copied, Cuda.DEFAULT_STREAM); + offset, bounceBuffer, 0, copied, Cuda.DEFAULT_STREAM); offset += copied; } diff --git a/java/src/test/resources/whitespaces.json b/java/src/test/resources/whitespaces.json new file mode 100644 index 00000000000..f5ddd8cde5f --- /dev/null +++ b/java/src/test/resources/whitespaces.json @@ -0,0 +1,5 @@ +{"a":"b"} + { "a" : "50" } +{"a": [1, 2, 3, 4, 5, 6, 7, 8]} +{"a": {"c": "d"}} +{"a": "b"} diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd index 68f01003fe6..86dc0f0de95 100644 --- a/python/cudf/cudf/_lib/cpp/sorting.pxd +++ b/python/cudf/cudf/_lib/cpp/sorting.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -68,3 +68,8 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: table_view source_table, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence) except + + + cdef unique_ptr[table] stable_sort( + table_view source_table, + vector[libcudf_types.order] column_order, + vector[libcudf_types.null_order] null_precedence) except + diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index e8539ecb9c3..55854a9444f 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -30,21 +30,28 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: vector[size_type] keys, size_type keep_threshold) except + + cdef unique_ptr[table] drop_nans(table_view source_table, + vector[size_type] keys, + size_type keep_threshold) except + + cdef unique_ptr[table] apply_boolean_mask( table_view source_table, column_view boolean_mask ) except + - cdef size_type distinct_count( - column_view source_table, - null_policy null_handling, - nan_policy nan_handling) except + + cdef unique_ptr[table] unique( + table_view input, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal, + ) except + - cdef unique_ptr[table] stable_distinct( + cdef unique_ptr[table] distinct( table_view input, vector[size_type] keys, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equals, ) except + cdef unique_ptr[column] distinct_indices( @@ -53,3 +60,29 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: null_equality nulls_equal, nan_equality nans_equal, ) except + + + cdef unique_ptr[table] stable_distinct( + table_view input, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, + ) except + + + cdef size_type unique_count( + column_view column, + null_policy null_handling, + nan_policy nan_handling) except + + + cdef size_type unique_count( + table_view source_table, + null_policy null_handling) except + + + cdef size_type distinct_count( + column_view column, + null_policy null_handling, + nan_policy nan_handling) except + + + cdef size_type distinct_count( + table_view source_table, + null_policy null_handling) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 68e6765cc49..fd749a5edc1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -28,6 +28,7 @@ set(cython_sources replace.pyx rolling.pyx scalar.pyx + search.pyx stream_compaction.pyx sorting.pyx table.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 5ef10fb2ffc..96aa42cc257 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -14,6 +14,7 @@ from . cimport ( reduce, replace, rolling, + search, sorting, stream_compaction, types, @@ -45,6 +46,7 @@ __all__ = [ "reduce", "replace", "rolling", + "search", "stream_compaction", "sorting", "types", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 4689c49fdb1..19cc782dd92 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -13,6 +13,7 @@ reduce, replace, rolling, + search, sorting, stream_compaction, types, @@ -43,6 +44,7 @@ "reduce", "replace", "rolling", + "search", "stream_compaction", "sorting", "types", diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pxd b/python/cudf/cudf/_lib/pylibcudf/search.pxd new file mode 100644 index 00000000000..0faf18b108f --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/search.pxd @@ -0,0 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from .column cimport Column +from .table cimport Table + + +cpdef Column lower_bound( + Table haystack, + Table needles, + list column_order, + list null_precedence, +) + +cpdef Column upper_bound( + Table haystack, + Table needles, + list column_order, + list null_precedence, +) + +cpdef Column contains(Column haystack, Column needles) diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pyx b/python/cudf/cudf/_lib/pylibcudf/search.pyx new file mode 100644 index 00000000000..a186167af13 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/search.pyx @@ -0,0 +1,116 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.cpp cimport search as cpp_search +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.types cimport null_order, order + +from .column cimport Column +from .table cimport Table + + +cpdef Column lower_bound( + Table haystack, + Table needles, + list column_order, + list null_precedence, +): + """Find smallest indices in haystack where needles may be inserted to retain order. + + Parameters + ---------- + haystack : Table + The search space. + needles : Table + The values for which to find insertion points. + column_order : List[ColumnOrder] + Whether each column should be sorted in ascending or descending order. + null_precedence : List[NullOrder] + Whether nulls should come before or after non-nulls. + + Returns + ------- + Column + The insertion points + """ + cdef unique_ptr[column] c_result + cdef vector[order] c_orders = column_order + cdef vector[null_order] c_null_precedence = null_precedence + with nogil: + c_result = move( + cpp_search.lower_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column upper_bound( + Table haystack, + Table needles, + list column_order, + list null_precedence, +): + """Find largest indices in haystack where needles may be inserted to retain order. + + Parameters + ---------- + haystack : Table + The search space. + needles : Table + The values for which to find insertion points. + column_order : List[ColumnOrder] + Whether each column should be sorted in ascending or descending order. + null_precedence : List[NullOrder] + Whether nulls should come before or after non-nulls. + + Returns + ------- + Column + The insertion points + """ + cdef unique_ptr[column] c_result + cdef vector[order] c_orders = column_order + cdef vector[null_order] c_null_precedence = null_precedence + with nogil: + c_result = move( + cpp_search.upper_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, + ) + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column contains(Column haystack, Column needles): + """Check whether needles are present in haystack. + + Parameters + ---------- + haystack : Table + The search space. + needles : Table + The values for which to search. + + Returns + ------- + Column + Boolean indicator for each needle. + """ + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_search.contains( + haystack.view(), + needles.view(), + ) + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd index fb22da0b0fd..3ed241622c0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd @@ -59,3 +59,5 @@ cpdef Table stable_sort_by_key( ) cpdef Table sort(Table source_table, list column_order, list null_precedence) + +cpdef Table stable_sort(Table source_table, list column_order, list null_precedence) diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx index 4e73760720a..1668a3efc7c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx @@ -50,7 +50,8 @@ cpdef Column stable_sorted_order( list column_order, list null_precedence, ): - """Computes the row indices required to sort the table, maintaining input order. + """Computes the row indices required to sort the table, + preserving order of equal elements. Parameters ---------- @@ -206,7 +207,8 @@ cpdef Table stable_segmented_sort_by_key( list column_order, list null_precedence, ): - """Sorts the table by key, within segments, maintaining input order. + """Sorts the table by key preserving order of equal elements, + within segments. Parameters ---------- @@ -287,7 +289,7 @@ cpdef Table stable_sort_by_key( list column_order, list null_precedence, ): - """Sorts the table by key, maintaining input order. + """Sorts the table by key preserving order of equal elements. Parameters ---------- @@ -349,3 +351,34 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence): ) ) return Table.from_libcudf(move(c_result)) + + +cpdef Table stable_sort(Table source_table, list column_order, list null_precedence): + """Sorts the table preserving order of equal elements. + + Parameters + ---------- + source_table : Table + The table to sort. + column_order : List[ColumnOrder] + Whether each column should be sorted in ascending or descending order. + null_precedence : List[NullOrder] + Whether nulls should come before or after non-nulls. + + Returns + ------- + Table + The sorted table. + """ + cdef unique_ptr[table] c_result + cdef vector[order] c_orders = column_order + cdef vector[null_order] c_null_precedence = null_precedence + with nogil: + c_result = move( + cpp_sorting.stable_sort( + source_table.view(), + c_orders, + c_null_precedence, + ) + ) + return Table.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd index 78adb20021c..29acc21fc05 100644 --- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd @@ -15,19 +15,21 @@ from .table cimport Table cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold) -cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask) +cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold) -cpdef size_type distinct_count( - Column source_table, - null_policy null_handling, - nan_policy nan_handling +cpdef Table unique( + Table input, + list keys, + duplicate_keep_option keep, + null_equality nulls_equal, ) -cpdef Table stable_distinct( +cpdef Table distinct( Table input, list keys, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equal, ) cpdef Column distinct_indices( @@ -36,3 +38,23 @@ cpdef Column distinct_indices( null_equality nulls_equal, nan_equality nans_equal, ) + +cpdef Table stable_distinct( + Table input, + list keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, +) + +cpdef size_type unique_count( + Column column, + null_policy null_handling, + nan_policy nan_handling +) + +cpdef size_type distinct_count( + Column column, + null_policy null_handling, + nan_policy nan_handling +) diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx index 0357866980a..af7a85d31bf 100644 --- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx @@ -51,6 +51,34 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): return Table.from_libcudf(move(c_result)) +cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): + """Filters out rows from the input table based on the presence of NaNs. + + Parameters + ---------- + source_table : Table + The input table to filter. + keys : List[size_type] + The list of column indexes to consider for NaN filtering. + keep_threshold : size_type + The minimum number of non-NaNs required to keep a row. + + Returns + ------- + Table + A new table with rows removed based on NaNs. + """ + cdef unique_ptr[table] c_result + cdef vector[size_type] c_keys = keys + with nogil: + c_result = move( + cpp_stream_compaction.drop_nulls( + source_table.view(), c_keys, keep_threshold + ) + ) + return Table.from_libcudf(move(c_result)) + + cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): """Filters out rows from the input table based on a boolean mask. @@ -76,39 +104,55 @@ cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): return Table.from_libcudf(move(c_result)) -cpdef size_type distinct_count( - Column source_table, - null_policy null_handling, - nan_policy nan_handling +cpdef Table unique( + Table input, + list keys, + duplicate_keep_option keep, + null_equality nulls_equal, ): - """Returns the number of unique elements in the input column. + """Filter duplicate consecutive rows from the input table. Parameters ---------- - source_table : Column - The input column to count the unique elements of. - null_handling : null_policy - Flag to include or exclude nulls from the count. - nan_handling : nan_policy - Flag to include or exclude NaNs from the count. + input : Table + The input table to filter + keys : list[int] + The list of column indexes to consider for filtering. + keep : duplicate_keep_option + The option to specify which rows to keep in the case of duplicates. + nulls_equal : null_equality + The option to specify how nulls are handled in the comparison. Returns ------- - size_type - The number of unique elements in the input column. + Table + New Table with unique rows from each sequence of equivalent rows + as specified by keep. In the same order as the input table. + + Notes + ----- + If the input columns to be filtered on are sorted, then + unique can produce the same result as stable_distinct, but faster. """ - return cpp_stream_compaction.distinct_count( - source_table.view(), null_handling, nan_handling - ) + cdef unique_ptr[table] c_result + cdef vector[size_type] c_keys = keys + with nogil: + c_result = move( + cpp_stream_compaction.unique( + input.view(), c_keys, keep, nulls_equal + ) + ) + return Table.from_libcudf(move(c_result)) -cpdef Table stable_distinct( +cpdef Table distinct( Table input, list keys, duplicate_keep_option keep, null_equality nulls_equal, + nan_equality nans_equal, ): - """Get the distinct rows from the input table, preserving input order. + """Get the distinct rows from the input table. Parameters ---------- @@ -120,18 +164,21 @@ cpdef Table stable_distinct( The option to specify which rows to keep in the case of duplicates. nulls_equal : null_equality The option to specify how nulls are handled in the comparison. + nans_equal : nan_equality + The option to specify how NaNs are handled in the comparison. Returns ------- Table - A new table with distinct rows from the input table. + A new table with distinct rows from the input table. The + output will not necessarily be in the same order as the input. """ cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: c_result = move( - cpp_stream_compaction.stable_distinct( - input.view(), c_keys, keep, nulls_equal + cpp_stream_compaction.distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal ) ) return Table.from_libcudf(move(c_result)) @@ -169,3 +216,99 @@ cpdef Column distinct_indices( ) ) return Column.from_libcudf(move(c_result)) + + +cpdef Table stable_distinct( + Table input, + list keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, +): + """Get the distinct rows from the input table, preserving input order. + + Parameters + ---------- + input : Table + The input table to filter. + keys : list + The list of column indexes to consider for distinct filtering. + keep : duplicate_keep_option + The option to specify which rows to keep in the case of duplicates. + nulls_equal : null_equality + The option to specify how nulls are handled in the comparison. + nans_equal : nan_equality + The option to specify how NaNs are handled in the comparison. + + Returns + ------- + Table + A new table with distinct rows from the input table, preserving + the input table order. + """ + cdef unique_ptr[table] c_result + cdef vector[size_type] c_keys = keys + with nogil: + c_result = move( + cpp_stream_compaction.stable_distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal + ) + ) + return Table.from_libcudf(move(c_result)) + + +cpdef size_type unique_count( + Column source, + null_policy null_handling, + nan_policy nan_handling +): + """Returns the number of unique consecutive elements in the input column. + + Parameters + ---------- + source : Column + The input column to count the unique elements of. + null_handling : null_policy + Flag to include or exclude nulls from the count. + nan_handling : nan_policy + Flag to include or exclude NaNs from the count. + + Returns + ------- + size_type + The number of unique consecutive elements in the input column. + + Notes + ----- + If the input column is sorted, then unique_count can produce the + same result as distinct_count, but faster. + """ + return cpp_stream_compaction.unique_count( + source.view(), null_handling, nan_handling + ) + + +cpdef size_type distinct_count( + Column source, + null_policy null_handling, + nan_policy nan_handling +): + """Returns the number of distinct elements in the input column. + + Parameters + ---------- + source : Column + The input column to count the unique elements of. + null_handling : null_policy + Flag to include or exclude nulls from the count. + nan_handling : nan_policy + Flag to include or exclude NaNs from the count. + + Returns + ------- + size_type + The number of distinct elements in the input column. + """ + return cpp_stream_compaction.distinct_count( + source.view(), null_handling, nan_handling + ) diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx index fef3a08c6d7..1ee73949fd3 100644 --- a/python/cudf/cudf/_lib/search.pyx +++ b/python/cudf/cudf/_lib/search.pyx @@ -1,18 +1,10 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector - -cimport cudf._lib.cpp.search as cpp_search -cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.column cimport Column -from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.utils cimport table_view_from_columns + +from cudf._lib import pylibcudf @acquire_spill_lock() @@ -31,50 +23,31 @@ def search_sorted( If 'left', the index of the first suitable location is given. If 'right', return the last such index """ - cdef unique_ptr[column] c_result - cdef vector[libcudf_types.order] c_column_order - cdef vector[libcudf_types.null_order] c_null_precedence - cdef libcudf_types.order c_order - cdef libcudf_types.null_order c_null_order - cdef table_view c_table_data = table_view_from_columns(source) - cdef table_view c_values_data = table_view_from_columns(values) - # Note: We are ignoring index columns here - c_order = (libcudf_types.order.ASCENDING - if ascending - else libcudf_types.order.DESCENDING) - c_null_order = ( - libcudf_types.null_order.AFTER - if na_position=="last" - else libcudf_types.null_order.BEFORE + column_order = [ + pylibcudf.types.Order.ASCENDING + if ascending + else pylibcudf.types.Order.DESCENDING + ] * len(source) + null_precedence = [ + pylibcudf.types.NullOrder.AFTER + if na_position == "last" + else pylibcudf.types.NullOrder.BEFORE + ] * len(source) + + func = getattr( + pylibcudf.search, + "lower_bound" if side == "left" else "upper_bound", ) - c_column_order = vector[libcudf_types.order](len(source), c_order) - c_null_precedence = vector[libcudf_types.null_order]( - len(source), c_null_order + return Column.from_pylibcudf( + func( + pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source]), + pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), + column_order, + null_precedence, + ) ) - if side == 'left': - with nogil: - c_result = move( - cpp_search.lower_bound( - c_table_data, - c_values_data, - c_column_order, - c_null_precedence, - ) - ) - elif side == 'right': - with nogil: - c_result = move( - cpp_search.upper_bound( - c_table_data, - c_values_data, - c_column_order, - c_null_precedence, - ) - ) - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def contains(Column haystack, Column needles): @@ -87,15 +60,9 @@ def contains(Column haystack, Column needles): needles : A column of values to search for """ - cdef unique_ptr[column] c_result - cdef column_view c_haystack = haystack.view() - cdef column_view c_needles = needles.view() - - with nogil: - c_result = move( - cpp_search.contains( - c_haystack, - c_needles, - ) + return Column.from_pylibcudf( + pylibcudf.search.contains( + haystack.to_pylibcudf(mode="read"), + needles.to_pylibcudf(mode="read"), ) - return Column.from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 04883eac559..834f91f48d9 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -109,6 +109,7 @@ def drop_duplicates(list columns, keep_option, pylibcudf.types.NullEquality.EQUAL if nulls_are_equal else pylibcudf.types.NullEquality.UNEQUAL, + pylibcudf.types.NanEquality.ALL_EQUAL, ) ) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 896cc55b425..b6637e9df08 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -149,7 +149,9 @@ cpdef generate_pandas_metadata(table, index): col for col in table._columns ], - df=table, + # It is OKAY to do `.head(0).to_pandas()` because + # this method will extract `.columns` metadata only + df=table.head(0).to_pandas(), column_names=col_names, index_levels=index_levels, index_descriptors=index_descriptors, diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 58e2241e810..de44f392eef 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -910,7 +910,7 @@ def notna(self): """ raise NotImplementedError - def to_pandas(self, *, nullable: bool = False): + def to_pandas(self, *, nullable: bool = False, arrow_type: bool = False): """ Convert to a Pandas Index. @@ -924,6 +924,12 @@ def to_pandas(self, *, nullable: bool = False): If ``nullable`` is ``False``, the resulting index will either convert null values to ``np.nan`` or ``None`` depending on the dtype. + arrow_type : bool, Default False + Return the Index with a ``pandas.ArrowDtype`` + + Notes + ----- + nullable and arrow_type cannot both be set to ``True`` Examples -------- @@ -937,6 +943,8 @@ def to_pandas(self, *, nullable: bool = False): >>> type(idx) + >>> idx.to_pandas(arrow_type=True) + Index([-3, 10, 15, 20], dtype='int64[pyarrow]') """ raise NotImplementedError diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index a1c86b617b0..2a46654ccc2 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -16,7 +16,6 @@ column_empty_like_same_mask, concat_columns, deserialize_columns, - full, serialize_columns, ) from cudf.core.column.datetime import DatetimeColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 9ecd461cf99..88bb4521a5b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -734,8 +734,8 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn: ) return other - ary = column.full( - len(self), self._encode(other), dtype=self.codes.dtype + ary = column.as_column( + self._encode(other), length=len(self), dtype=self.codes.dtype ) return column.build_categorical_column( categories=self.dtype.categories._values, @@ -770,10 +770,16 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: ) def to_pandas( - self, *, index: Optional[pd.Index] = None, nullable: bool = False + self, + *, + index: Optional[pd.Index] = None, + nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") + elif arrow_type: + raise NotImplementedError(f"{arrow_type=} is not implemented.") if self.categories.dtype.kind == "f": new_mask = bools_to_mask(self.notnull()) @@ -1438,11 +1444,9 @@ def _create_empty_categorical_column( return column.build_categorical_column( categories=column.as_column(dtype.categories), codes=column.as_column( - column.full( - categorical_column.size, - _DEFAULT_CATEGORICAL_VALUE, - categorical_column.codes.dtype, - ) + _DEFAULT_CATEGORICAL_VALUE, + length=categorical_column.size, + dtype=categorical_column.codes.dtype, ), offset=categorical_column.offset, size=categorical_column.size, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index cecdaf70750..8941d111d02 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -58,7 +58,6 @@ infer_dtype, is_bool_dtype, is_datetime64_dtype, - is_decimal_dtype, is_dtype_equal, is_integer_dtype, is_list_dtype, @@ -199,6 +198,7 @@ def to_pandas( *, index: Optional[pd.Index] = None, nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: """Convert object to pandas type. @@ -206,13 +206,23 @@ def to_pandas( """ # This default implementation does not handle nulls in any meaningful # way - if nullable: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - pd_series = self.to_arrow().to_pandas() + pa_array = self.to_arrow() + if arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(pa_array), index=index + ) + else: + pd_series = pa_array.to_pandas() - if index is not None: - pd_series.index = index - return pd_series + if index is not None: + pd_series.index = index + return pd_series @property def values_host(self) -> "np.ndarray": @@ -855,7 +865,7 @@ def isin(self, values: Sequence) -> ColumnBase: except ValueError: # pandas functionally returns all False when cleansing via # typecasting fails - return full(len(self), False, dtype="bool") + return as_column(False, length=len(self), dtype="bool") return lhs._obtain_isin_result(rhs) @@ -882,9 +892,9 @@ def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]: if self.null_count and rhs.null_count: return self.isnull() else: - return cudf.core.column.full(len(self), False, dtype="bool") + return as_column(False, length=len(self), dtype="bool") elif self.null_count == 0 and (rhs.null_count == len(rhs)): - return cudf.core.column.full(len(self), False, dtype="bool") + return as_column(False, length=len(self), dtype="bool") else: return None @@ -1345,9 +1355,7 @@ def _label_encoding( na_sentinel = cudf.Scalar(-1) def _return_sentinel_column(): - return cudf.core.column.full( - size=len(self), fill_value=na_sentinel, dtype=dtype - ) + return as_column(na_sentinel, dtype=dtype, length=len(self)) if dtype is None: dtype = min_scalar_type(max(len(cats), na_sentinel), 8) @@ -1444,7 +1452,9 @@ def column_empty( elif isinstance(dtype, ListDtype): data = None children = ( - full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype), + as_column( + 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype + ), column_empty(row_count, dtype=dtype.element_type), ) elif isinstance(dtype, CategoricalDtype): @@ -1463,7 +1473,9 @@ def column_empty( elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): data = as_buffer(rmm.DeviceBuffer(size=0)) children = ( - full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype), + as_column( + 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype + ), ) else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) @@ -2006,33 +2018,32 @@ def as_column( if dtype is not None: data = data.astype(dtype) - elif isinstance(arbitrary, (pd.Timestamp, pd.Timedelta)): - # This will always treat NaTs as nulls since it's not technically a - # discrete value like NaN - length = length or 1 - data = as_column( - pa.array(pd.Series([arbitrary] * length), from_pandas=True) - ) - if dtype is not None: - data = data.astype(dtype) - - elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview): - length = length or 1 + elif is_scalar(arbitrary) and not isinstance(arbitrary, memoryview): + if length is None: + length = 1 + elif length < 0: + raise ValueError(f"{length=} must be >=0.") + if isinstance(arbitrary, pd.Interval): + # No cudf.Scalar support yet + return as_column( + pd.Series([arbitrary] * length), + nan_as_null=nan_as_null, + dtype=dtype, + length=length, + ) if ( - (nan_as_null is True) + nan_as_null is True and isinstance(arbitrary, (np.floating, float)) and np.isnan(arbitrary) ): - arbitrary = None if dtype is None: - dtype = cudf.dtype("float64") - - data = as_column(full(length, arbitrary, dtype=dtype)) - if not nan_as_null and not is_decimal_dtype(data.dtype): - if np.issubdtype(data.dtype, np.floating): - data = data.fillna(np.nan) - elif np.issubdtype(data.dtype, np.datetime64): - data = data.fillna(np.datetime64("NaT")) + dtype = getattr(arbitrary, "dtype", cudf.dtype("float64")) + arbitrary = None + arbitrary = cudf.Scalar(arbitrary, dtype=dtype) + if length == 0: + return column_empty(length, dtype=arbitrary.dtype) + else: + return ColumnBase.from_scalar(arbitrary, length) elif hasattr(arbitrary, "__array_interface__"): # CUDF assumes values are always contiguous @@ -2150,8 +2161,6 @@ def as_column( return as_column( np.asarray(view), dtype=dtype, nan_as_null=nan_as_null ) - elif isinstance(arbitrary, cudf.Scalar): - data = ColumnBase.from_scalar(arbitrary, length if length else 1) else: if dtype is not None: # Arrow throws a type error if the input is of @@ -2494,42 +2503,6 @@ def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]: return columns -def full( - size: int, fill_value: ScalarLike, dtype: Optional[Dtype] = None -) -> ColumnBase: - """ - Returns a column of given size and dtype, filled with a given value. - - Parameters - ---------- - size : int - size of the expected column. - fill_value : scalar - A scalar value to fill a new array. - dtype : default None - Data type specifier. It is inferred from other arguments by default. - - Returns - ------- - Column - - Examples - -------- - >>> import cudf - >>> col = cudf.core.column.full(size=5, fill_value=7, dtype='int8') - >>> col - - >>> cudf.Series(col) - 0 7 - 1 7 - 2 7 - 3 7 - 4 7 - dtype: int8 - """ - return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) - - def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b03b21a7aba..85f07064c97 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -318,18 +318,27 @@ def to_pandas( *, index: Optional[pd.Index] = None, nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: - if nullable: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - # `copy=True` workaround until following issue is fixed: - # https://issues.apache.org/jira/browse/ARROW-9772 - - return pd.Series( - self.to_arrow(), - copy=True, - dtype=self.dtype, - index=index, - ) + elif arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index + ) + else: + # `copy=True` workaround until following issue is fixed: + # https://issues.apache.org/jira/browse/ARROW-9772 + return pd.Series( + self.to_arrow(), + copy=True, + dtype=self.dtype, + index=index, + ) @property def values(self): @@ -723,15 +732,25 @@ def to_pandas( *, index: Optional[pd.Index] = None, nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: - if nullable: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - series = self._local_time.to_pandas().dt.tz_localize( - self.dtype.tz, ambiguous="NaT", nonexistent="NaT" - ) - if index is not None: - series.index = index - return series + elif arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index + ) + else: + series = self._local_time.to_pandas().dt.tz_localize( + self.dtype.tz, ambiguous="NaT", nonexistent="NaT" + ) + if index is not None: + series.index = index + return series def to_arrow(self): return pa.compute.assume_timezone( diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 0e90b522f2c..b83a6ded416 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -69,8 +69,8 @@ def as_string_column( def __pow__(self, other): if isinstance(other, int): if other == 0: - res = cudf.core.column.full( - size=len(self), fill_value=1, dtype=self.dtype + res = cudf.core.column.as_column( + 1, dtype=self.dtype, length=len(self) ) if self.nullable: res = res.set_mask(self.mask) diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index 5d93fa26298..dc609f732e0 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -105,15 +105,25 @@ def as_interval_column(self, dtype): raise ValueError("dtype must be IntervalDtype") def to_pandas( - self, *, index: Optional[pd.Index] = None, nullable: bool = False + self, + *, + index: Optional[pd.Index] = None, + nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: # Note: This does not handle null values in the interval column. # However, this exact sequence (calling __from_arrow__ on the output of # self.to_arrow) is currently the best known way to convert interval # types into pandas (trying to convert the underlying numerical columns # directly is problematic), so we're stuck with this for now. + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") + elif arrow_type: + raise NotImplementedError(f"{arrow_type=} is not implemented.") return pd.Series( self.dtype.to_pandas().__from_arrow__(self.to_arrow()), index=index ) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index b2205af34e8..1c2bcbef2ec 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -6,6 +6,7 @@ from typing import List, Optional, Sequence, Tuple, Union import numpy as np +import pandas as pd import pyarrow as pa from typing_extensions import Self @@ -288,6 +289,29 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self: ) return lc + def to_pandas( + self, + *, + index: Optional[pd.Index] = None, + nullable: bool = False, + arrow_type: bool = False, + ) -> pd.Series: + # Can't rely on Column.to_pandas implementation for lists. + # Need to perform `to_pylist` to preserve list types. + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + if nullable: + raise NotImplementedError(f"{nullable=} is not implemented.") + pa_array = self.to_arrow() + if arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(pa_array), index=index + ) + else: + return pd.Series(pa_array.tolist(), dtype="object", index=index) + class ListMethods(ColumnMethods): """ diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index b80dd626066..8d9da8982ac 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -42,7 +42,6 @@ as_column, build_column, column, - full, string, ) from cudf.core.dtypes import CategoricalDtype @@ -513,7 +512,7 @@ def find_and_replace( ) if len(replacement_col) == 1 and len(to_replace_col) > 1: replacement_col = column.as_column( - full(len(to_replace_col), replacement[0], self.dtype) + replacement[0], length=len(to_replace_col), dtype=self.dtype ) elif len(replacement_col) == 1 and len(to_replace_col) == 0: return self.copy() @@ -690,8 +689,17 @@ def to_pandas( *, index: Optional[pd.Index] = None, nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: - if nullable and self.dtype in np_dtypes_to_pandas_dtypes: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + if arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index + ) + elif nullable and self.dtype in np_dtypes_to_pandas_dtypes: pandas_nullable_dtype = np_dtypes_to_pandas_dtypes[self.dtype] arrow_array = self.to_arrow() pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 2373f94ee97..e947c9375d7 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5499,7 +5499,9 @@ def __init__( if len(children) == 0 and size != 0: # all nulls-column: - offsets = column.full(size + 1, 0, dtype=size_type_dtype) + offsets = column.as_column( + 0, length=size + 1, dtype=size_type_dtype + ) children = (offsets,) @@ -5791,8 +5793,17 @@ def to_pandas( *, index: Optional[pd.Index] = None, nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: - if nullable: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + if arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index + ) + elif nullable: pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow()) pd_series = pd.Series(pandas_array, copy=False) else: @@ -5921,8 +5932,8 @@ def _binaryop( "__eq__", "__ne__", }: - return column.full( - len(self), op == "__ne__", dtype="bool" + return column.as_column( + op == "__ne__", length=len(self), dtype="bool" ).set_mask(self.mask) else: return NotImplemented @@ -5931,7 +5942,9 @@ def _binaryop( if isinstance(other, cudf.Scalar): other = cast( StringColumn, - column.full(len(self), other, dtype="object"), + column.as_column( + other, length=len(self), dtype="object" + ), ) # Explicit types are necessary because mypy infers ColumnBase diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 69e9a50956b..1b2ffcc2700 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -58,14 +58,27 @@ def to_arrow(self): ) def to_pandas( - self, *, index: Optional[pd.Index] = None, nullable: bool = False + self, + *, + index: Optional[pd.Index] = None, + nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: # We cannot go via Arrow's `to_pandas` because of the following issue: # https://issues.apache.org/jira/browse/ARROW-12680 - if nullable: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - - return pd.Series(self.to_arrow().tolist(), dtype="object", index=index) + pa_array = self.to_arrow() + if arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(pa_array), index=index + ) + else: + return pd.Series(pa_array.tolist(), dtype="object", index=index) @cached_property def memory_usage(self): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index b911c86fa01..ee326b254b9 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -147,20 +147,31 @@ def to_arrow(self) -> pa.Array: ) def to_pandas( - self, *, index: Optional[pd.Index] = None, nullable: bool = False + self, + *, + index: Optional[pd.Index] = None, + nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: # `copy=True` workaround until following issue is fixed: # https://issues.apache.org/jira/browse/ARROW-9772 - - if nullable: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - - return pd.Series( - self.to_arrow(), - copy=True, - dtype=self.dtype, - index=index, - ) + elif arrow_type: + return pd.Series( + pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index + ) + else: + return pd.Series( + self.to_arrow(), + copy=True, + dtype=self.dtype, + index=index, + ) def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: reflect, op = self._check_reflected_op(op) @@ -499,7 +510,7 @@ def components(self, index=None) -> "cudf.DataFrame": break for name in keys_list: - res_col = cudf.core.column.full(len(self), 0, dtype="int64") + res_col = column.as_column(0, length=len(self), dtype="int64") if self.nullable: res_col = res_col.set_mask(self.mask) data[name] = res_col @@ -588,7 +599,7 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn": # of nanoseconds. if self._time_unit != "ns": - res_col = cudf.core.column.full(len(self), 0, dtype="int64") + res_col = column.as_column(0, length=len(self), dtype="int64") if self.nullable: res_col = res_col.set_mask(self.mask) return cast("cudf.core.column.NumericalColumn", res_col) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a0e1a041342..31a748da856 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1407,7 +1407,7 @@ def __setitem__(self, arg, value): allow_non_unique=True, ) if is_scalar(value): - self._data[arg] = column.full(len(self), value) + self._data[arg] = as_column(value, length=len(self)) else: value = as_column(value) self._data[arg] = value @@ -1455,8 +1455,8 @@ def __setitem__(self, arg, value): else: for col in arg: if is_scalar(value): - self._data[col] = column.full( - size=len(self), fill_value=value + self._data[col] = as_column( + value, length=len(self) ) else: self._data[col] = column.as_column(value) @@ -3205,10 +3205,16 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): ) if _is_scalar_or_zero_d_array(value): - value = column.full( - len(self), + dtype = None + if isinstance(value, (np.ndarray, cupy.ndarray)): + dtype = value.dtype + value = value.item() + if libcudf.scalar._is_null_host_scalar(value): + dtype = "str" + value = as_column( value, - "str" if libcudf.scalar._is_null_host_scalar(value) else None, + length=len(self), + dtype=dtype, ) if len(self) == 0: @@ -5203,7 +5209,9 @@ def describe( return res @_cudf_nvtx_annotate - def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame: + def to_pandas( + self, *, nullable: bool = False, arrow_type: bool = False + ) -> pd.DataFrame: """ Convert to a Pandas DataFrame. @@ -5218,11 +5226,17 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame: If ``nullable`` is ``False``, the resulting columns will either convert null values to ``np.nan`` or ``None`` depending on the dtype. + arrow_type : bool, Default False + Return the Index with a ``pandas.ArrowDtype`` Returns ------- out : Pandas DataFrame + Notes + ----- + nullable and arrow_type cannot both be set to ``True`` + Examples -------- >>> import cudf @@ -5236,8 +5250,7 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame: >>> type(pdf) - ``nullable`` parameter can be used to control - whether dtype can be Pandas Nullable or not: + ``nullable=True`` converts the result to pandas nullable types: >>> df = cudf.DataFrame({'a': [0, None, 2], 'b': [True, False, None]}) >>> df @@ -5265,13 +5278,20 @@ def to_pandas(self, *, nullable: bool = False) -> pd.DataFrame: a float64 b object dtype: object + + ``arrow_type=True`` converts the result to ``pandas.ArrowDtype``: + + >>> df.to_pandas(arrow_type=True).dtypes + a int64[pyarrow] + b bool[pyarrow] + dtype: object """ out_data = {} out_index = self.index.to_pandas() for i, col_key in enumerate(self._data): out_data[i] = self._data[col_key].to_pandas( - index=out_index, nullable=nullable + index=out_index, nullable=nullable, arrow_type=arrow_type ) out_df = pd.DataFrame(out_data, index=out_index) @@ -5898,7 +5918,7 @@ def isin(self, values): fill_value = cudf.Scalar(False) def make_false_column_like_self(): - return column.full(len(self), fill_value, "bool") + return column.as_column(fill_value, length=len(self), dtype="bool") # Preprocess different input types into a mapping from column names to # a list of values to check. @@ -6017,7 +6037,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): { name: filtered._data[name]._get_mask_as_column() if filtered._data[name].nullable - else column.full(len(filtered._data[name]), True) + else as_column(True, length=len(filtered._data[name])) for name in filtered._data.names } ) @@ -7808,8 +7828,8 @@ def func(left, right, output): return output for name in uncommon_columns: - output._data[name] = column.full( - size=len(output), fill_value=value, dtype="bool" + output._data[name] = as_column( + value, length=len(output), dtype="bool" ) return output diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e4370be304a..caf5ac5928f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -363,13 +363,22 @@ def indices(self): >>> df.groupby(by=["a"]).indices {10: array([0, 1]), 40: array([2])} """ - group_names, offsets, _, grouped_values = self._grouped() + offsets, group_keys, (indices,) = self._groupby.groups( + [ + cudf.core.column.as_column( + range(len(self.obj)), dtype=size_type_dtype + ) + ] + ) + group_keys = libcudf.stream_compaction.drop_duplicates(group_keys) + if len(group_keys) > 1: + index = cudf.MultiIndex.from_arrays(group_keys) + else: + (group_keys,) = group_keys + index = cudf.Index(group_keys) return dict( - zip( - group_names.to_pandas(), - np.split(grouped_values.index.values, offsets[1:-1]), - ) + zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1])) ) @_cudf_nvtx_annotate @@ -414,8 +423,7 @@ def get_group(self, name, obj=None): "instead of ``gb.get_group(name, obj=df)``.", FutureWarning, ) - - return obj.loc[self.groups[name].drop_duplicates()] + return obj.iloc[self.indices[name]] @_cudf_nvtx_annotate def size(self): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1b9893d1256..bd9dc1ae3da 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -483,9 +483,13 @@ def dtype(self): return _maybe_convert_to_default_type(dtype) @_cudf_nvtx_annotate - def to_pandas(self, *, nullable: bool = False) -> pd.RangeIndex: + def to_pandas( + self, *, nullable: bool = False, arrow_type: bool = False + ) -> pd.RangeIndex: if nullable: raise NotImplementedError(f"{nullable=} is not implemented.") + elif arrow_type: + raise NotImplementedError(f"{arrow_type=} is not implemented.") return pd.RangeIndex( start=self._start, stop=self._stop, @@ -1227,9 +1231,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): ) needle = as_column(target) - result = cudf.core.column.full( - len(needle), - fill_value=-1, + result = as_column( + -1, + length=len(needle), dtype=libcudf.types.size_type_dtype, ) @@ -1521,9 +1525,12 @@ def _clean_nulls_from_index(self): def any(self): return self._values.any() - def to_pandas(self, *, nullable: bool = False) -> pd.Index: + def to_pandas( + self, *, nullable: bool = False, arrow_type: bool = False + ) -> pd.Index: return pd.Index( - self._values.to_pandas(nullable=nullable), name=self.name + self._values.to_pandas(nullable=nullable, arrow_type=arrow_type), + name=self.name, ) def append(self, other): @@ -2094,18 +2101,26 @@ def isocalendar(self): return cudf.core.tools.datetimes._to_iso_calendar(self) @_cudf_nvtx_annotate - def to_pandas(self, *, nullable: bool = False) -> pd.DatetimeIndex: - if nullable: + def to_pandas( + self, *, nullable: bool = False, arrow_type: bool = False + ) -> pd.DatetimeIndex: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - freq = ( - self._freq._maybe_as_fast_pandas_offset() - if self._freq is not None - else None - ) - return pd.DatetimeIndex( - self._values.to_pandas(), name=self.name, freq=freq - ) + result = self._values.to_pandas(arrow_type=arrow_type) + if arrow_type: + return pd.Index(result, name=self.name) + else: + freq = ( + self._freq._maybe_as_fast_pandas_offset() + if self._freq is not None + else None + ) + return pd.DatetimeIndex(result, name=self.name, freq=freq) @_cudf_nvtx_annotate def _get_dt_field(self, field): @@ -2426,13 +2441,21 @@ def __getitem__(self, index): return value @_cudf_nvtx_annotate - def to_pandas(self, *, nullable: bool = False) -> pd.TimedeltaIndex: - if nullable: + def to_pandas( + self, *, nullable: bool = False, arrow_type: bool = False + ) -> pd.TimedeltaIndex: + if arrow_type and nullable: + raise ValueError( + f"{arrow_type=} and {nullable=} cannot both be set." + ) + elif nullable: raise NotImplementedError(f"{nullable=} is not implemented.") - return pd.TimedeltaIndex( - self._values.to_pandas(), - name=self.name, - ) + + result = self._values.to_pandas(arrow_type=arrow_type) + if arrow_type: + return pd.Index(result, name=self.name) + else: + return pd.TimedeltaIndex(result, name=self.name) @property # type: ignore @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8e43000d0a8..df703370f78 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -50,7 +50,7 @@ from cudf.core._base_index import BaseIndex from cudf.core._compat import PANDAS_LT_300 from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ColumnBase, as_column, full +from cudf.core.column import ColumnBase, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.copy_types import BooleanMask, GatherMap from cudf.core.dtypes import ListDtype @@ -2872,6 +2872,8 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: self._column_names, None if has_range_index or not keep_index else self._index.names, ) + result._data.label_dtype = self._data.label_dtype + result._data.rangeindex = self._data.rangeindex if keep_index and has_range_index: result.index = self.index[start:stop] @@ -3046,14 +3048,14 @@ def duplicated(self, subset=None, keep="first"): (result,) = libcudf.copying.scatter( [cudf.Scalar(False, dtype=bool)], distinct, - [full(len(self), True, dtype=bool)], + [as_column(True, length=len(self), dtype=bool)], bounds_check=False, ) return cudf.Series(result, index=self.index) @_cudf_nvtx_annotate def _empty_like(self, keep_index=True) -> Self: - return self._from_columns_like_self( + result = self._from_columns_like_self( libcudf.copying.columns_empty_like( [ *(self._index._data.columns if keep_index else ()), @@ -3063,6 +3065,9 @@ def _empty_like(self, keep_index=True) -> Self: self._column_names, self._index.names if keep_index else None, ) + result._data.label_dtype = self._data.label_dtype + result._data.rangeindex = self._data.rangeindex + return result def _split(self, splits, keep_index=True): if self._num_rows == 0: @@ -3322,9 +3327,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): # Mask and data column preallocated ans_col = _return_arr_from_dtype(retty, len(self)) - ans_mask = cudf.core.column.full( - size=len(self), fill_value=True, dtype="bool" - ) + ans_mask = as_column(True, length=len(self), dtype="bool") output_args = [(ans_col, ans_mask), len(self)] input_args = _get_input_args_from_frame(self) launch_args = output_args + input_args + list(args) @@ -6255,10 +6258,10 @@ def _get_replacement_values_for_columns( values_columns = { col: [value] if _is_non_decimal_numeric_dtype(columns_dtype_map[col]) - else full( - len(to_replace), + else as_column( value, - cudf.dtype(type(value)), + length=len(to_replace), + dtype=cudf.dtype(type(value)), ) for col in columns_dtype_map } diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index df1b1ea10cd..315a21020a2 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -667,7 +667,7 @@ def isin(self, values, level=None): self_df = self.to_frame(index=False).reset_index() values_df = values_idx.to_frame(index=False) idx = self_df.merge(values_df, how="leftsemi")._data["index"] - res = cudf.core.column.full(size=len(self), fill_value=False) + res = column.as_column(False, length=len(self)) res[idx] = True result = res.values else: @@ -1574,10 +1574,12 @@ def droplevel(self, level=-1): return mi @_cudf_nvtx_annotate - def to_pandas(self, *, nullable: bool = False) -> pd.MultiIndex: + def to_pandas( + self, *, nullable: bool = False, arrow_type: bool = False + ) -> pd.MultiIndex: result = self.to_frame( index=False, name=list(range(self.nlevels)) - ).to_pandas(nullable=nullable) + ).to_pandas(nullable=nullable, arrow_type=arrow_type) return pd.MultiIndex.from_frame(result, names=self.names) @classmethod @@ -1843,9 +1845,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): "index must be monotonic increasing or decreasing" ) - result = cudf.core.column.full( - len(target), - fill_value=-1, + result = column.as_column( + -1, + length=len(target), dtype=libcudf.types.size_type_dtype, ) if not len(self): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3f51ecdf7dc..1b18e11c047 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -55,7 +55,6 @@ IntervalColumn, TimeDeltaColumn, as_column, - full, ) from cudf.core.column.categorical import ( CategoricalAccessor as CategoricalAccessor, @@ -1311,7 +1310,7 @@ def map(self, arg, na_action=None) -> "Series": { "x": arg.keys(), "s": arg.values(), - "bool": full(len(arg), True, dtype=self.dtype), + "bool": as_column(True, length=len(arg), dtype=self.dtype), } ) res = lhs.merge(rhs, on="x", how="left").sort_values( @@ -1333,7 +1332,7 @@ def map(self, arg, na_action=None) -> "Series": { "x": arg.keys(), "s": arg, - "bool": full(len(arg), True, dtype=self.dtype), + "bool": as_column(True, length=len(arg), dtype=self.dtype), } ) res = lhs.merge(rhs, on="x", how="left").sort_values( @@ -1983,10 +1982,14 @@ def any(self, axis=0, bool_only=None, skipna=True, **kwargs): @_cudf_nvtx_annotate def to_pandas( - self, *, index: bool = True, nullable: bool = False + self, + *, + index: bool = True, + nullable: bool = False, + arrow_type: bool = False, ) -> pd.Series: """ - Convert to a Pandas Series. + Convert to a pandas Series. Parameters ---------- @@ -2003,10 +2006,16 @@ def to_pandas( If ``nullable`` is ``False``, the resulting series will either convert null values to ``np.nan`` or ``None`` depending on the dtype. + arrow_type : bool, Default False + Return the Series with a ``pandas.ArrowDtype`` Returns ------- - out : Pandas Series + out : pandas Series + + Notes + ----- + nullable and arrow_type cannot both be set to ``True`` Examples -------- @@ -2021,8 +2030,7 @@ def to_pandas( >>> type(pds) - ``nullable`` parameter can be used to control - whether dtype can be Pandas Nullable or not: + ``nullable=True`` converts the result to pandas nullable types: >>> ser = cudf.Series([10, 20, None, 30]) >>> ser @@ -2043,12 +2051,23 @@ def to_pandas( 2 NaN 3 30.0 dtype: float64 + + ``arrow_type=True`` converts the result to ``pandas.ArrowDtype``: + + >>> ser.to_pandas(arrow_type=True) + 0 10 + 1 20 + 2 + 3 30 + dtype: int64[pyarrow] """ if index is True: index = self.index.to_pandas() else: index = None # type: ignore[assignment] - s = self._column.to_pandas(index=index, nullable=nullable) + s = self._column.to_pandas( + index=index, nullable=nullable, arrow_type=arrow_type + ) s.name = self.name return s diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 97779522b8b..19dde2e51b9 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -242,7 +242,10 @@ def __cuda_array_interface__(self): try: return self._column.__cuda_array_interface__ except NotImplementedError: - raise AttributeError + raise AttributeError( + f"'{type(self).__name__}' object has no attribute " + "'__cuda_array_interface__'" + ) @_cudf_nvtx_annotate def factorize(self, sort=False, use_na_sentinel=True): diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 0e0df4ecf6e..d182b7b4a7c 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -770,7 +770,7 @@ def _isin_datetimelike( was_string = len(rhs) and rhs.dtype.kind == "O" if rhs.dtype.kind in {"f", "i", "u"}: - return cudf.core.column.full(len(lhs), False, dtype="bool") + return column.as_column(False, length=len(lhs), dtype="bool") rhs = rhs.astype(lhs.dtype) if was_string: warnings.warn( @@ -787,7 +787,7 @@ def _isin_datetimelike( except ValueError: # pandas functionally returns all False when cleansing via # typecasting fails - return cudf.core.column.full(len(lhs), False, dtype="bool") + return column.as_column(False, length=len(lhs), dtype="bool") res = lhs._obtain_isin_result(rhs) return res diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 890e4ecc2f0..2037b1682db 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -9,7 +9,6 @@ import cudf from cudf import _lib as libcudf from cudf.api.types import is_integer, is_number -from cudf.core import column from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import as_column from cudf.core.mixins import Reducible @@ -236,8 +235,8 @@ def _apply_agg_column(self, source_column, agg_name): window = None else: preceding_window = as_column(self.window) - following_window = column.full( - self.window.size, 0, dtype=self.window.dtype + following_window = as_column( + 0, length=self.window.size, dtype=self.window.dtype ) window = None diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 6c70b08384f..bead9c352ef 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -20,7 +20,7 @@ import cudf from cudf._lib import parquet as libparquet from cudf.api.types import is_list_like -from cudf.core.column import build_categorical_column, column_empty, full +from cudf.core.column import as_column, build_categorical_column, column_empty from cudf.utils import ioutils from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate @@ -762,9 +762,9 @@ def _parquet_to_frame( _len = len(dfs[-1]) if partition_categories and name in partition_categories: # Build the categorical column from `codes` - codes = full( - size=_len, - fill_value=partition_categories[name].index(value), + codes = as_column( + partition_categories[name].index(value), + length=_len, ) dfs[-1][name] = build_categorical_column( categories=partition_categories[name], @@ -788,10 +788,10 @@ def _parquet_to_frame( masked=True, ) else: - dfs[-1][name] = full( - size=_len, - fill_value=value, + dfs[-1][name] = as_column( + value, dtype=_dtype, + length=_len, ) if len(dfs) > 1: diff --git a/python/cudf/cudf/tests/groupby/test_groupby_obj.py b/python/cudf/cudf/tests/groupby/test_groupby_obj.py new file mode 100644 index 00000000000..04b483e08dc --- /dev/null +++ b/python/cudf/cudf/tests/groupby/test_groupby_obj.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from numpy.testing import assert_array_equal + +import cudf +from cudf.testing._utils import assert_eq + + +def test_groupby_14955(): + # https://github.com/rapidsai/cudf/issues/14955 + df = cudf.DataFrame({"a": [1, 2] * 2}, index=[0] * 4) + agg = df.groupby("a") + pagg = df.to_pandas().groupby("a") + for key in agg.groups: + assert_array_equal(pagg.indices[key], agg.indices[key].get()) + assert_eq(pagg.get_group(key), agg.get_group(key)) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py index 1f20152172b..213c6c2c1f9 100644 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ b/python/cudf/cudf/tests/test_cuda_array_interface.py @@ -4,10 +4,10 @@ from contextlib import ExitStack as does_not_raise import cupy +import numba.cuda import numpy as np import pandas as pd import pytest -from numba import cuda import cudf from cudf.core.buffer.spill_manager import get_global_manager @@ -25,7 +25,7 @@ def test_cuda_array_interface_interop_in(dtype, module): if dtype in DATETIME_TYPES: expectation = pytest.raises(ValueError) elif module == "numba": - module_constructor = cuda.to_device + module_constructor = numba.cuda.to_device with expectation: module_data = module_constructor(np_data) @@ -55,7 +55,7 @@ def to_host_function(x): return cupy.asnumpy(x) elif module == "numba": - module_constructor = cuda.as_cuda_array + module_constructor = numba.cuda.as_cuda_array def to_host_function(x): return x.copy_to_host() @@ -89,7 +89,7 @@ def to_host_function(x): elif module == "numba": expectation = pytest.raises(NotImplementedError) - module_constructor = cuda.as_cuda_array + module_constructor = numba.cuda.as_cuda_array def to_host_function(x): return x.copy_to_host() @@ -135,9 +135,11 @@ def test_cuda_array_interface_as_column(dtype, nulls, mask_type): if mask_type == "bools": if nulls == "some": - obj.__cuda_array_interface__["mask"] = cuda.to_device(mask) + obj.__cuda_array_interface__["mask"] = numba.cuda.to_device(mask) elif nulls == "all": - obj.__cuda_array_interface__["mask"] = cuda.to_device([False] * 10) + obj.__cuda_array_interface__["mask"] = numba.cuda.to_device( + [False] * 10 + ) expect = sr got = cudf.Series(obj) @@ -193,7 +195,11 @@ def test_cuda_array_interface_pytorch(): assert_eq(got, cudf.Series(buffer, dtype=np.bool_)) - # TODO: This test fails with PyTorch 2. Is it still expected to be valid? + # TODO: This test fails with PyTorch 2. It appears that PyTorch + # checks that the pointer is device-accessible even when the + # size is zero. See + # https://github.com/pytorch/pytorch/issues/98133 + # # index = cudf.Index([], dtype="float64") # tensor = torch.tensor(index) # got = cudf.Index(tensor) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2084db89909..3143851ddd6 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3012,43 +3012,31 @@ def test_series_rename(): @pytest.mark.parametrize("data_type", dtypes) @pytest.mark.parametrize("nelem", [0, 100]) def test_head_tail(nelem, data_type): - def check_index_equality(left, right): - assert left.index.equals(right.index) - - def check_values_equality(left, right): - if len(left) == 0 and len(right) == 0: - return None - - np.testing.assert_array_equal(left.to_pandas(), right.to_pandas()) - - def check_frame_series_equality(left, right): - check_index_equality(left, right) - check_values_equality(left, right) - - gdf = cudf.DataFrame( + pdf = pd.DataFrame( { "a": np.random.randint(0, 1000, nelem).astype(data_type), "b": np.random.randint(0, 1000, nelem).astype(data_type), } ) + gdf = cudf.from_pandas(pdf) - check_frame_series_equality(gdf.head(), gdf[:5]) - check_frame_series_equality(gdf.head(3), gdf[:3]) - check_frame_series_equality(gdf.head(-2), gdf[:-2]) - check_frame_series_equality(gdf.head(0), gdf[0:0]) + assert_eq(gdf.head(), pdf.head()) + assert_eq(gdf.head(3), pdf.head(3)) + assert_eq(gdf.head(-2), pdf.head(-2)) + assert_eq(gdf.head(0), pdf.head(0)) - check_frame_series_equality(gdf["a"].head(), gdf["a"][:5]) - check_frame_series_equality(gdf["a"].head(3), gdf["a"][:3]) - check_frame_series_equality(gdf["a"].head(-2), gdf["a"][:-2]) + assert_eq(gdf["a"].head(), pdf["a"].head()) + assert_eq(gdf["a"].head(3), pdf["a"].head(3)) + assert_eq(gdf["a"].head(-2), pdf["a"].head(-2)) - check_frame_series_equality(gdf.tail(), gdf[-5:]) - check_frame_series_equality(gdf.tail(3), gdf[-3:]) - check_frame_series_equality(gdf.tail(-2), gdf[2:]) - check_frame_series_equality(gdf.tail(0), gdf[0:0]) + assert_eq(gdf.tail(), pdf.tail()) + assert_eq(gdf.tail(3), pdf.tail(3)) + assert_eq(gdf.tail(-2), pdf.tail(-2)) + assert_eq(gdf.tail(0), pdf.tail(0)) - check_frame_series_equality(gdf["a"].tail(), gdf["a"][-5:]) - check_frame_series_equality(gdf["a"].tail(3), gdf["a"][-3:]) - check_frame_series_equality(gdf["a"].tail(-2), gdf["a"][2:]) + assert_eq(gdf["a"].tail(), pdf["a"].tail()) + assert_eq(gdf["a"].tail(3), pdf["a"].tail(3)) + assert_eq(gdf["a"].tail(-2), pdf["a"].tail(-2)) def test_tail_for_string(): @@ -4328,6 +4316,17 @@ def test_one_row_head(): assert_eq(head_pdf, head_gdf) +@pytest.mark.parametrize("index", [None, [123], ["a", "b"]]) +def test_no_cols_head(index): + pdf = pd.DataFrame(index=index) + gdf = cudf.from_pandas(pdf) + + head_gdf = gdf.head() + head_pdf = pdf.head() + + assert_eq(head_pdf, head_gdf) + + @pytest.mark.parametrize("dtype", ALL_TYPES) @pytest.mark.parametrize( "np_dtype,pd_dtype", @@ -10862,3 +10861,44 @@ def test_dataframe_duplicate_index_reindex(): lfunc_args_and_kwargs=([10, 11, 12, 13], {}), rfunc_args_and_kwargs=([10, 11, 12, 13], {}), ) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + [1], + decimal.Decimal("1.0"), + ], +) +def test_dataframe_to_pandas_arrow_type_nullable_raises(scalar): + pa_array = pa.array([scalar, None]) + df = cudf.DataFrame({"a": pa_array}) + with pytest.raises(ValueError): + df.to_pandas(nullable=True, arrow_type=True) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + [1], + decimal.Decimal("1.0"), + ], +) +def test_dataframe_to_pandas_arrow_type(scalar): + pa_array = pa.array([scalar, None]) + df = cudf.DataFrame({"a": pa_array}) + result = df.to_pandas(arrow_type=True) + expected = pd.DataFrame({"a": pd.arrays.ArrowExtensionArray(pa_array)}) + pd.testing.assert_frame_equal(result, expected) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index cced05d2217..51e9a3022f4 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -3,6 +3,7 @@ """ Test related to Index """ +import datetime import operator import re @@ -3138,3 +3139,40 @@ def test_from_pandas_rangeindex_return_rangeindex(): def test_index_to_pandas_nullable_notimplemented(idx): with pytest.raises(NotImplementedError): idx.to_pandas(nullable=True) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + ], +) +def test_index_to_pandas_arrow_type_nullable_raises(scalar): + pa_array = pa.array([scalar, None]) + idx = cudf.Index(pa_array) + with pytest.raises(ValueError): + idx.to_pandas(nullable=True, arrow_type=True) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + ], +) +def test_index_to_pandas_arrow_type(scalar): + pa_array = pa.array([scalar, None]) + idx = cudf.Index(pa_array) + result = idx.to_pandas(arrow_type=True) + expected = pd.Index(pd.arrays.ArrowExtensionArray(pa_array)) + pd.testing.assert_index_equal(result, expected) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 7ae7ae34b97..f04cb8a91a4 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import functools import operator @@ -41,6 +41,8 @@ def test_create_list_series(data): expect = pd.Series(data) got = cudf.Series(data) assert_eq(expect, got) + assert isinstance(got[0], type(expect[0])) + assert isinstance(got.to_pandas()[0], type(expect[0])) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index a13fe333107..4926d79e734 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -3,6 +3,7 @@ """ Test related to MultiIndex """ +import datetime import itertools import operator import pickle @@ -13,6 +14,7 @@ import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa import pytest import cudf @@ -2118,3 +2120,35 @@ def test_multiindex_from_arrays(array): def test_multiindex_from_arrays_wrong_arg(arg): with pytest.raises(TypeError): cudf.MultiIndex.from_arrays(arg) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + ], +) +def test_index_to_pandas_arrow_type_nullable_raises(scalar): + pa_array = pa.array([scalar, None]) + midx = cudf.MultiIndex(levels=[pa_array], codes=[[0]]) + with pytest.raises(ValueError): + midx.to_pandas(nullable=True, arrow_type=True) + + +@pytest.mark.parametrize( + "scalar", + [1, 1.0, "a", datetime.datetime(2020, 1, 1), datetime.timedelta(1)], +) +def test_index_to_pandas_arrow_type(scalar): + pa_array = pa.array([scalar, None]) + midx = cudf.MultiIndex(levels=[pa_array], codes=[[0]]) + result = midx.to_pandas(arrow_type=True) + expected = pd.MultiIndex( + levels=[pd.arrays.ArrowExtensionArray(pa_array)], codes=[[0]] + ) + pd.testing.assert_index_equal(result, expected) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index caf8947e3b0..e043f358bbe 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,5 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. - +import datetime import decimal import hashlib import operator @@ -2708,3 +2708,44 @@ def test_series_from_large_string(): expected = pd.Series(pa_large_string_array) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + [1], + decimal.Decimal("1.0"), + ], +) +def test_series_to_pandas_arrow_type_nullable_raises(scalar): + pa_array = pa.array([scalar, None]) + ser = cudf.Series(pa_array) + with pytest.raises(ValueError, match=".* cannot both be set"): + ser.to_pandas(nullable=True, arrow_type=True) + + +@pytest.mark.parametrize( + "scalar", + [ + 1, + 1.0, + "a", + datetime.datetime(2020, 1, 1), + datetime.timedelta(1), + {"1": 2}, + [1], + decimal.Decimal("1.0"), + ], +) +def test_series_to_pandas_arrow_type(scalar): + pa_array = pa.array([scalar, None]) + ser = cudf.Series(pa_array) + result = ser.to_pandas(arrow_type=True) + expected = pd.Series(pd.arrays.ArrowExtensionArray(pa_array)) + pd.testing.assert_series_equal(result, expected) diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index 091cd6b57a4..1994536f395 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -6,7 +6,7 @@ import pytest import cudf -from cudf.core.column.column import as_column, full +from cudf.core.column.column import as_column from cudf.testing import ( assert_frame_equal, assert_index_equal, @@ -172,8 +172,8 @@ def test_assert_column_equal_dtype_edge_cases(other): assert_column_equal(base.slice(0, 0), other.slice(0, 0), check_dtype=False) assert_column_equal(other.slice(0, 0), base.slice(0, 0), check_dtype=False) - base = full(len(base), fill_value=cudf.NA, dtype=base.dtype) - other = full(len(other), fill_value=cudf.NA, dtype=other.dtype) + base = as_column(cudf.NA, length=len(base), dtype=base.dtype) + other = as_column(cudf.NA, length=len(other), dtype=other.dtype) assert_column_equal(base, other, check_dtype=False) assert_column_equal(other, base, check_dtype=False) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index ec5693e14d2..95621cf9519 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import decimal import functools @@ -396,8 +396,8 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value): else: result_mask = None - result_col = column.full( - size=len(lhs), fill_value=bool_fill_value, dtype=cudf.dtype(np.bool_) + result_col = column.as_column( + bool_fill_value, dtype=cudf.dtype(np.bool_), length=len(lhs) ) if result_mask is not None: result_col = result_col.set_mask(result_mask.as_mask()) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 454cce76ff2..317c45ba582 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -105,8 +105,10 @@ def _get_non_empty_data(s): categories = ( s.categories if len(s.categories) else [UNKNOWN_CATEGORIES] ) - codes = cudf.core.column.full( - size=2, fill_value=0, dtype=cudf._lib.types.size_type_dtype + codes = cudf.core.column.as_column( + 0, + dtype=cudf._lib.types.size_type_dtype, + length=2, ) ordered = s.ordered data = cudf.core.column.build_categorical_column( diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index c8cc6e65fa5..30251b88dea 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -702,13 +702,9 @@ def test_is_supported(arg, supported): def test_groupby_unique_lists(): df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]}) - ddf = dd.from_pandas(df, 2) gdf = cudf.from_pandas(df) gddf = dask_cudf.from_cudf(gdf, 2) - dd.assert_eq( - ddf.groupby("a").b.unique().compute(), - gddf.groupby("a").b.unique().compute(), - ) + dd.assert_eq( gdf.groupby("a").b.unique(), gddf.groupby("a").b.unique().compute(),